Skip to content

Instantly share code, notes, and snippets.

@Max191
Created September 14, 2023 19:54
Show Gist options
  • Save Max191/ae5f79c327dda99bafd273799f0a5cbe to your computer and use it in GitHub Desktop.
FuseDequantizationMatmul
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
// Pre-fusion form: the i4 weights are fully dequantized to a large f32 tensor
// by a standalone linalg.generic, and a second f32 generic then performs the
// matmul-style reduction against the activations.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
// %0: i4 quantized weights, 11008 rows x 32 groups x 128 elements per group.
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xf32>
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
// %3: f32 activations, viewed as 1x1x(32 groups)x(128 elements).
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%4 = tensor.empty() : tensor<11008x32x128xf32>
// Drop the trailing unit dims so the per-(row, group) parameters index as 2-D.
// %1 is multiplied in below (scale); %2 is subtracted (zero point).
%collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
%collapsed_0 = tensor.collapse_shape %2 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
// Dequantize every weight element: f32 = (extui(i4) - zero_point) * scale.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0, %collapsed, %collapsed_0 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%4 : tensor<11008x32x128xf32>) {
^bb0(%in: i4, %in_2: f32, %in_3: f32, %out: f32):
%10 = arith.extui %in : i4 to i32
%11 = arith.uitofp %10 : i32 to f32
%12 = arith.subf %11, %in_3 : f32
%13 = arith.mulf %12, %in_2 : f32
linalg.yield %13 : f32
} -> tensor<11008x32x128xf32>
// Flatten the activations' leading unit dims to match the (group, elem) maps.
%collapsed_1 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x1x32x128xf32> into tensor<32x128xf32>
%6 = tensor.empty() : tensor<11008xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<11008xf32>) -> tensor<11008xf32>
// Matmul as a reduction: out[d0] = sum over (d1, d2) of act[d1, d2] * deq[d0, d1, d2].
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%collapsed_1, %5 : tensor<32x128xf32>, tensor<11008x32x128xf32>) outs(%7 : tensor<11008xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%10 = arith.mulf %in, %in_2 : f32
%11 = arith.addf %10, %out : f32
linalg.yield %11 : f32
} -> tensor<11008xf32>
// Restore the 1x1x11008 shape the caller exported.
%expanded = tensor.expand_shape %8 [[0, 1, 2]] : tensor<11008xf32> into tensor<1x1x11008xf32>
%9 = hal.tensor.export %expanded "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %9 : !hal.buffer_view
}
// -----// IR Dump After FuseDequantizationMatmul (iree-flow-fuse-dequantization-matmul) //----- //
// Post-fusion form: instead of materializing the dequantized f32 weights, the
// pass dynamically quantizes the activations to i8 per group (first dispatch
// region), performs the inner product entirely in integers, and applies all
// scales / zero-point corrections in a final f32 rescale (second region).
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// 127.0 is the divisor used to derive the per-group i8 activation scale below.
%cst = arith.constant 1.270000e+02 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c0_i16 = arith.constant 0 : i16
// %0: i4 quantized weights; %1 weight scales; %2 weight zero points
// (roles established by their uses in the final rescale generic).
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
%collapsed_1 = tensor.collapse_shape %2 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
%collapsed_2 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x1x32x128xf32> into tensor<32x128xf32>
%4 = tensor.empty() : tensor<11008xf32>
%5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<11008xf32>) -> tensor<11008xf32>
%6 = tensor.empty() : tensor<32xf32>
%7 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<32xf32>) -> tensor<32xf32>
%8 = tensor.empty() : tensor<32xf32>
%9 = tensor.empty() : tensor<32xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<32xf32>) -> tensor<32xf32>
%11 = tensor.empty() : tensor<32x128xi8>
// Dispatch 1: dynamic per-group quantization of the activations.
// Returns (quantized i8 activations, per-group sums, per-group scales).
%12:3 = flow.dispatch.region -> (tensor<32x128xi8>, tensor<32xf32>, tensor<32xf32>) {
// %17: per-group max(|act|) via absf + maxf reduction over the 128 elements.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed_2 : tensor<32x128xf32>) outs(%7 : tensor<32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = math.absf %in : f32
%22 = arith.maxf %21, %out : f32
linalg.yield %22 : f32
} -> tensor<32xf32>
// %18: per-group activation scale = max(|act|) / 127.
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%17 : tensor<32xf32>) outs(%8 : tensor<32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.divf %in, %cst : f32
linalg.yield %21 : f32
} -> tensor<32xf32>
// %19: per-group sum of the raw activations (feeds the zero-point
// correction term in the final rescale).
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed_2 : tensor<32x128xf32>) outs(%10 : tensor<32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.addf %in, %out : f32
linalg.yield %21 : f32
} -> tensor<32xf32>
// %20: quantize each activation to i8 as fptoui(act / scale).
// NOTE(review): arith.fptoui of a negative value is poison in MLIR; this
// presumably relies on the activations (or the pass's preconditions)
// keeping the quotient non-negative — confirm against the pass.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_2, %18 : tensor<32x128xf32>, tensor<32xf32>) outs(%11 : tensor<32x128xi8>) {
^bb0(%in: f32, %in_3: f32, %out: i8):
%21 = arith.divf %in, %in_3 : f32
%22 = arith.fptoui %21 : f32 to i8
linalg.yield %22 : i8
} -> tensor<32x128xi8>
flow.return %20, %19, %18 : tensor<32x128xi8>, tensor<32xf32>, tensor<32xf32>
}
%13 = tensor.empty() : tensor<11008x32xi16>
%14 = linalg.fill ins(%c0_i16 : i16) outs(%13 : tensor<11008x32xi16>) -> tensor<11008x32xi16>
// Dispatch 2: integer inner product followed by the f32 rescale.
%15 = flow.dispatch.region -> (tensor<11008xf32>) {
// %17: per-(row, group) accumulation over the 128-element axis of
// extsi(i8 act) * extui(i4 weight), accumulated in i16.
// NOTE(review): worst-case magnitude 127 * 15 * 128 exceeds the i16
// range; presumably the pass guarantees tighter value ranges — confirm.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12#0, %0 : tensor<32x128xi8>, tensor<11008x32x128xi4>) outs(%14 : tensor<11008x32xi16>) {
^bb0(%in: i8, %in_3: i4, %out: i16):
%19 = arith.extsi %in : i8 to i16
%20 = arith.extui %in_3 : i4 to i16
%21 = arith.muli %19, %20 : i16
%22 = arith.addi %21, %out : i16
linalg.yield %22 : i16
} -> tensor<11008x32xi16>
// %18: reduce over groups in f32:
//   out[row] += acc * actScale * wScale  -  actSum * wScale * wZeroPoint
// which is the expansion of sum((q_w - zp) * wScale * act) with
// act ~= q_act * actScale.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%17, %12#2, %12#1, %collapsed, %collapsed_1 : tensor<11008x32xi16>, tensor<32xf32>, tensor<32xf32>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%5 : tensor<11008xf32>) {
^bb0(%in: i16, %in_3: f32, %in_4: f32, %in_5: f32, %in_6: f32, %out: f32):
%19 = arith.extsi %in : i16 to i32
%20 = arith.sitofp %19 : i32 to f32
%21 = arith.mulf %20, %in_3 : f32
%22 = arith.mulf %21, %in_5 : f32
%23 = arith.mulf %in_6, %in_5 : f32
%24 = arith.mulf %23, %in_4 : f32
%25 = arith.subf %22, %24 : f32
%26 = arith.addf %25, %out : f32
linalg.yield %26 : f32
} -> tensor<11008xf32>
flow.return %18 : tensor<11008xf32>
}
// Restore the 1x1x11008 shape the caller exported.
%expanded = tensor.expand_shape %15 [[0, 1, 2]] : tensor<11008xf32> into tensor<1x1x11008xf32>
%16 = hal.tensor.export %expanded "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %16 : !hal.buffer_view
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment