@vmurali
Created March 8, 2023 22:13
This file has been truncated.
// -----// IR Dump Before TosaToSCF (tosa-to-scf) //----- //
func.func @tensor_float() {
%0 = util.unfoldable_constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%1 = util.unfoldable_constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%2 = util.unfoldable_constant dense<1.000000e+00> : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq_const(%3, dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>) : tensor<2x1xf32>
return
}
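// Note: tosa.fully_connected computes out[n, oc] = sum_k(input[n, k] * weight[oc, k]) + bias[oc],
// i.e. a matmul against the transposed weight plus a broadcast bias; the per-pass lowering below
// makes that structure explicit. The expected constant checks out by hand:
//   row 0: 1*7 + 2*8 + 3*9 + 1 = 51
//   row 1: 4*7 + 5*8 + 6*9 + 1 = 123
// A minimal plain-Python sketch of the same computation (names inp/wgt/bias are illustrative only,
// not part of the dump):
//   inp  = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
//   wgt  = [[7.0, 8.0, 9.0]]   # one output channel
//   bias = [1.0]
//   out  = [[sum(i * w for i, w in zip(row, wgt[0])) + bias[0]] for row in inp]
//   assert out == [[51.0], [123.0]]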
// -----// IR Dump After TosaToSCF (tosa-to-scf) //----- //
func.func @tensor_float() {
%0 = util.unfoldable_constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%1 = util.unfoldable_constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%2 = util.unfoldable_constant dense<1.000000e+00> : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq_const(%3, dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TopLevelSCFToCFG (iree-top-level-scf-to-cfg) //----- //
func.func @tensor_float() {
%0 = util.unfoldable_constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%1 = util.unfoldable_constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%2 = util.unfoldable_constant dense<1.000000e+00> : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq_const(%3, dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>) : tensor<2x1xf32>
return
}
// -----// IR Dump After TopLevelSCFToCFG (iree-top-level-scf-to-cfg) //----- //
func.func @tensor_float() {
%0 = util.unfoldable_constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%1 = util.unfoldable_constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%2 = util.unfoldable_constant dense<1.000000e+00> : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq_const(%3, dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Inliner (inline) //----- //
module {
func.func @tensor_float() {
%0 = util.unfoldable_constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%1 = util.unfoldable_constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%2 = util.unfoldable_constant dense<1.000000e+00> : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq_const(%3, dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%0 = util.unfoldable_constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%1 = util.unfoldable_constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%2 = util.unfoldable_constant dense<1.000000e+00> : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq_const(%3, dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
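// Note: canonicalization rewrote each util.unfoldable_constant into a plain arith.constant fed
// through util.optimization_barrier, which keeps the test inputs opaque to constant folding, and
// turned check.expect_eq_const into check.expect_eq against a hoisted constant. The computation
// itself is unchanged.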
// -----// IR Dump After Inliner (inline) //----- //
module {
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaToArith (tosa-to-arith) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaToArith (tosa-to-arith) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaToTensor (tosa-to-tensor) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaToTensor (tosa-to-tensor) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaToLinalgExt (iree-tosa-to-linalg-ext) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaToLinalgExt (iree-tosa-to-linalg-ext) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaOptionalDecompositions (tosa-optional-decompositions) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaOptionalDecompositions (tosa-optional-decompositions) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaToLinalgNamed (tosa-to-linalg-named) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
check.expect_eq(%3, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaToLinalgNamed (tosa-to-linalg-named) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%cst_3 = arith.constant 0.000000e+00 : f32
%4 = linalg.fill ins(%cst_3 : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%cst_4 = arith.constant dense<[1, 0]> : tensor<2xi64>
%5 = "tosa.transpose"(%1, %cst_4) : (tensor<1x3xf32>, tensor<2xi64>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%9 = arith.addf %in, %in_5 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst) : tensor<2x1xf32>
return
}
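// Note: TosaToLinalgNamed lowers tosa.fully_connected into named linalg ops: a zero-filled 2x1
// init (tensor.empty + linalg.fill), a tosa.transpose of the 1x3 weight to 3x1, a linalg.matmul
// accumulating into that init, and a linalg.generic that broadcast-adds the 1-element bias
// (indexing map (d0, d1) -> (d1)) to every row. A plain-Python sketch of the same step sequence
// (illustrative names only):
//   inp, bias = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [1.0]
//   acc = [[0.0], [0.0]]                                   # tensor.empty + linalg.fill with 0.0
//   w_t = [[7.0], [8.0], [9.0]]                            # transposed 1x3 weight
//   mm  = [[acc[n][0] + sum(inp[n][k] * w_t[k][0] for k in range(3))] for n in range(2)]  # matmul
//   out = [[bias[0] + mm[n][0]] for n in range(2)]         # broadcast bias add
//   assert out == [[51.0], [123.0]]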
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%cst_3 = arith.constant 0.000000e+00 : f32
%4 = linalg.fill ins(%cst_3 : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%cst_4 = arith.constant dense<[1, 0]> : tensor<2xi64>
%5 = "tosa.transpose"(%1, %cst_4) : (tensor<1x3xf32>, tensor<2xi64>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%9 = arith.addf %in, %in_5 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
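// Note: the canonicalizer folded the tosa.transpose with constant permutation [1, 0] into a
// tosa.reshape to 3x1. Because one of the two dimensions is 1, transposing a 1x3 tensor does not
// reorder elements in row-major layout ([[7, 8, 9]] and [[7], [8], [9]] flatten to the same
// [7, 8, 9]), so a plain reshape is sufficient.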
// -----// IR Dump Before TosaLayerwiseConstantFoldPass (tosa-layerwise-constant-fold) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaLayerwiseConstantFoldPass (tosa-layerwise-constant-fold) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaValidation (tosa-validate) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaValidation (tosa-validate) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaToLinalg (tosa-to-linalg) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaToArith (tosa-to-arith) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaToArith (tosa-to-arith) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before TosaToTensor (tosa-to-tensor) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = "tosa.reshape"(%1) {new_shape = array<i64: 3, 1>} : (tensor<1x3xf32>) -> tensor<3x1xf32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.matmul ins(%0, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %7 : tensor<1xf32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%9 = arith.addf %in, %in_4 : f32
linalg.yield %9 : f32
} -> tensor<2x1xf32>
check.expect_eq(%8, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After TosaToTensor (tosa-to-tensor) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
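// Note: TosaToTensor lowers the 1x3 -> 3x1 tosa.reshape into a tensor.collapse_shape down to a
// rank-1 tensor<3xf32> followed by a tensor.expand_shape back up to tensor<3x1xf32>; both simply
// regroup the same elements without reordering them.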
// -----// IR Dump Before StripSignedness (iree-flow-strip-signedness) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After StripSignedness (iree-flow-strip-signedness) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before LinalgQuantizedMatmulToMatmulPass (iree-linalg-quantized-matmul-to-matmul) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-linalg-quantized-matmul-to-matmul) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before LinalgQuantizedConvToConvPass (iree-linalg-quantized-conv-to-conv) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-linalg-quantized-conv-to-conv) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
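// Note (annotation): this canonicalizer run leaves the function body unchanged; the "Before" and
// "After" dumps above are identical.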
// -----// IR Dump Before VerifyCompilerTOSAInputLegality (iree-tosa-verify-compiler-input-legality) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After VerifyCompilerTOSAInputLegality (iree-tosa-verify-compiler-input-legality) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before IREEImportPublic (iree-import-public) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After IREEImportPublic (iree-import-public) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before ImportMLProgram (iree-import-ml-program) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ImportMLProgram (iree-import-ml-program) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
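// Note (annotation): ImportMLProgram appears to be a no-op for this module; the before/after dumps
// are identical and the input contains no ml_program ops to convert.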
// -----// IR Dump Before SanitizeModuleNames (iree-sanitize-module-names) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
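// Note (annotation): WrapEntryPointsPass moved the original body into the new private
// @_tensor_float and reduced the public @tensor_float to an ABI stub (tagged iree.abi.stub)
// that just calls it.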
// -----// IR Dump Before Inliner (inline) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Inliner (inline) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = tensor.empty() : tensor<2x1xf32>
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<1xf32>, tensor<2x1xf32>) outs(%5 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%8 = arith.addf %in, %in_4 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
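// Note (annotation): CSE deduplicated the two identical `tensor.empty() : tensor<2x1xf32>` ops;
// the linalg.generic now uses %3 as its init operand, and the remaining values are renumbered
// (the matmul result becomes %5 instead of %6, the generic result %6 instead of %7).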
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
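// Note (annotation): SymbolDCE removes nothing here; the private @_tensor_float is still
// referenced by the call inside @tensor_float, so both symbols survive.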
// -----// IR Dump Before DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
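// Note (annotation): DemoteF64ToF32 makes no changes; every value in this module is already f32,
// so there are no f64 types to demote.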
// -----// IR Dump Before DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before LinalgNamedOpConversion (linalg-named-op-conversion) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before LinalgNamedOpConversion (linalg-named-op-conversion) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before EraseUnusedLinalgOperands (iree-flow-erase-unused-linalg-operands) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After EraseUnusedLinalgOperands (iree-flow-erase-unused-linalg-operands) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before VerifyInputLegality (iree-verify-input-legality) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before ExpandTensorShapes (iree-flow-expand-tensor-shapes) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ExpandTensorShapes (iree-flow-expand-tensor-shapes) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
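// Note (annotation): ExpandTensorShapes appears to be a no-op here; the before/after dumps are
// identical and all tensor shapes in this test are static.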
// -----// IR Dump Before FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- //
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before RaiseSpecialOps (iree-flow-raise-special-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<1xf32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%7 = arith.addf %in, %in_4 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After RaiseSpecialOps (iree-flow-raise-special-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
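Relative to the preceding dumps, LinalgFoldUnitExtentDims is the first pass in this stretch that actually rewrites @_tensor_float: the bias-add linalg.generic no longer operates on the 2x1 shape directly. The tensor<1xf32> bias is collapsed to a rank-0 tensor<f32>, the 2x1 matmul result is collapsed to tensor<2xf32>, the add runs as a 1-D generic, and the tensor<2xf32> sum is expanded back to tensor<2x1xf32> for the check. A rough NumPy analogue of that rewrite (illustrative only, values stand in for the SSA results in the dump):

import numpy as np

matmul_2x1 = np.array([[50.], [122.]], dtype=np.float32)  # stands in for %5
bias_1 = np.array([1.], dtype=np.float32)                 # stands in for %2

# Before the pass: 2-D elementwise add, bias broadcast along the row dimension.
before = matmul_2x1 + bias_1.reshape(1, 1)

# After the pass: unit dims folded away -- add over tensor<2xf32> with a scalar
# tensor<f32> bias, then expand_shape back to 2x1.
after = (matmul_2x1.reshape(2) + bias_1.reshape(())).reshape(2, 1)

assert np.array_equal(before, after)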
// -----// IR Dump Before RaiseSpecialOps (iree-flow-raise-special-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before LinalgDetensorize (linalg-detensorize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After RaiseSpecialOps (iree-flow-raise-special-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After LinalgDetensorize (linalg-detensorize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CollapseDims (iree-flow-collapse-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SplitReduction (iree-flow-split-reduction-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CollapseDimensions (iree-flow-collapse-dimensions) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%collapsed_5 = tensor.collapse_shape %5 [[0, 1]] : tensor<2x1xf32> into tensor<2xf32>
%6 = tensor.empty() : tensor<2xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_4, %collapsed_5 : tensor<f32>, tensor<2xf32>) outs(%6 : tensor<2xf32>) {
^bb0(%in: f32, %in_7: f32, %out: f32):
%8 = arith.addf %in, %in_7 : f32
linalg.yield %8 : f32
} -> tensor<2xf32>
%expanded_6 = tensor.expand_shape %7 [[0, 1]] : tensor<2xf32> into tensor<2x1xf32>
check.expect_eq(%expanded_6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before LinalgDetensorize (linalg-detensorize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After LinalgDetensorize (linalg-detensorize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = tensor.empty() : tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%6 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CollapseDims (iree-flow-collapse-dims) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before SplitReduction (iree-flow-split-reduction-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%5 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%7 = arith.addf %in, %in_5 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
check.expect_eq(%6, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1_5 = arith.constant 1 : index
%5 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %c2, %c1_5]
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
%6 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_6, %c1_7, %c1_8]
%7 = flow.dispatch.region[%5, %6] -> (tensor<2x1xf32>) {
%8 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %8 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%10 = arith.addf %in, %in_9 : f32
linalg.yield %10 : f32
} -> tensor<2x1xf32>
flow.return %9 : tensor<2x1xf32>
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
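// For reference (hand-computed, not compiler output): the two affine.apply results above are constant.
// With s0 = 0, s1 = 2, s2 = 1 the map ((s1 - s0) ceildiv s2) yields %5 = 2, and with s0 = 0, s1 = 1,
// s2 = 1 it yields %6 = 1, which is why subsequent dumps index the region as flow.dispatch.region[%c2, %c1].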
// -----// IR Dump Before CollapseDimensions (iree-flow-collapse-dimensions) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1_5 = arith.constant 1 : index
%5 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %c2, %c1_5]
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
%6 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_6, %c1_7, %c1_8]
%7 = flow.dispatch.region[%5, %6] -> (tensor<2x1xf32>) {
%8 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %8 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_9: f32, %out: f32):
%10 = arith.addf %in, %in_9 : f32
linalg.yield %10 : f32
} -> tensor<2x1xf32>
flow.return %9 : tensor<2x1xf32>
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%7, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%5 = flow.dispatch.region[%c2, %c1] -> (tensor<2x1xf32>) {
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %6 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
flow.return %7 : tensor<2x1xf32>
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump Before FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_2 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_3 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_3 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_2 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_1 : tensor<1xf32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x3xf32> into tensor<3xf32>
%expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<3xf32> into tensor<3x1xf32>
%collapsed_4 = tensor.collapse_shape %2 [] : tensor<1xf32> into tensor<f32>
%5 = flow.dispatch.region[%c2, %c1] -> (tensor<2x1xf32>) {
%6 = linalg.matmul ins(%0, %expanded : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_4, %6 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_5: f32, %out: f32):
%8 = arith.addf %in, %in_5 : f32
linalg.yield %8 : f32
} -> tensor<2x1xf32>
flow.return %7 : tensor<2x1xf32>
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst_0) : tensor<2x1xf32>
return
}
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
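// For reference (hand-computed, not compiler output): the dispatch above implements the original
// tosa.fully_connected. The matmul rows are 1*7 + 2*8 + 3*9 = 50 and 4*7 + 5*8 + 6*9 = 122; adding the
// broadcast bias of 1.0 gives [[51], [123]], matching the %cst constant compared by check.expect_eq.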
// -----// IR Dump Before CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch.workgroups[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%6 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%9 = tensor.empty() : tensor<2x1xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x1xf32>) -> tensor<2x1xf32>
%11 = linalg.matmul ins(%6, %7 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<2x1xf32>) -> tensor<2x1xf32>
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%8, %11 : tensor<f32>, tensor<2x1xf32>) outs(%9 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_4: f32, %out: f32):
%13 = arith.addf %in, %in_4 : f32
linalg.yield %13 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %12, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
flow.return
} count(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
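// Reference sketch (hand-written, not produced by the IREE pipeline): the outlined dispatch
// @_tensor_float_dispatch_0_matmul_2x1x3 above performs a 2x1x3 matmul of the 2x3 input with the
// weights reshaped to 3x1, then a broadcast add of the scalar bias. A minimal NumPy check, using
// only the constants visible in these dumps, reproduces the values asserted by check.expect_eq:
//
//   # reference check, not compiler output
//   import numpy as np
//
//   lhs = np.array([[1.0, 2.0, 3.0],
//                   [4.0, 5.0, 6.0]], dtype=np.float32)      # tensor<2x3xf32> input
//   weights = np.array([[7.0, 8.0, 9.0]], dtype=np.float32)  # tensor<1x3xf32> constant
//   bias = np.float32(1.0)                                    # tensor<1xf32>, reshaped to tensor<f32>
//
//   rhs = weights.reshape(3, 1)        # flow.tensor.reshape : tensor<1x3xf32> -> tensor<3x1xf32>
//   result = lhs @ rhs + bias          # linalg.matmul followed by the linalg.generic bias add
//   print(result)                      # [[ 51.] [123.]] == dense<[[5.100000e+01], [1.230000e+02]]>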
// -----// IR Dump Before StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before DeduplicateExecutables (iree-flow-deduplicate-executables) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
// -----// IR Dump Before CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
// -----// IR Dump Before CSE (cse) //----- //
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before VerifyInput (iree-stream-verify-input) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After VerifyInput (iree-stream-verify-input) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before OutlineConstants (iree-stream-outline-constants) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1xf32>
%cst_1 = arith.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_2 = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = util.optimization_barrier %cst_2 : tensor<2x3xf32>
%1 = util.optimization_barrier %cst_1 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst_0 : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %cst) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After OutlineConstants (iree-stream-outline-constants) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%_constant = util.global.load @_constant : tensor<2x1xf32>
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%_constant = util.global.load @_constant : tensor<2x1xf32>
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
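// Comparing this dump with the one just before it, the only visible effect of
// SimplifyGlobalAccesses on @_tensor_float is a reordering: the util.global.load ops
// are hoisted above the arith.constant ops so the global accesses sit together at the
// top of the function.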
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
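// The @_constant value above is just the precomputed result of the dispatch body: a
// 2x3 * 3x1 matmul followed by the broadcast bias add. A minimal Python sketch of that
// arithmetic (assuming the usual row-by-column linalg.matmul semantics):
lhs  = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]   # tensor<2x3xf32> (@_constant_1)
rhs  = [[7.0], [8.0], [9.0]]                # tensor<3x1xf32> (reshaped @_constant_0)
bias = 1.0                                  # tensor<f32> (reshaped 1-element bias)
out  = [[sum(lhs[i][k] * rhs[k][0] for k in range(3)) + bias] for i in range(2)]
print(out)  # [[51.0], [123.0]] == dense<[[5.100000e+01], [1.230000e+02]]>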
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
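// Note: the module is unchanged across FoldGlobals, FuseGlobals, and IPO here; the
// "Before" and "After" dumps for those three passes are identical, so the next
// structural change comes from ConvertToStream below.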
// -----// IR Dump Before ConvertToStream (iree-stream-conversion) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant {noinline} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global private @_constant_0 {noinline} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global private @_constant_1 {noinline} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
flow.executable private @_tensor_float_dispatch_0 {
flow.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !flow.dispatch.tensor<readonly:tensor<2x3xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<3x1xf32>>, %arg2: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%3 = tensor.empty() : tensor<2x1xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x1xf32>) -> tensor<2x1xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%4 : tensor<2x1xf32>) -> tensor<2x1xf32>
%6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2, %5 : tensor<f32>, tensor<2x1xf32>) outs(%3 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
%_constant = util.global.load @_constant : tensor<2x1xf32>
%_constant_0 = util.global.load @_constant_0 : tensor<1x3xf32>
%_constant_1 = util.global.load @_constant_1 : tensor<2x3xf32>
%0 = util.optimization_barrier %_constant_1 : tensor<2x3xf32>
%1 = util.optimization_barrier %_constant_0 : tensor<1x3xf32>
%2 = util.optimization_barrier %cst : tensor<1xf32>
%3 = flow.tensor.reshape %1 : tensor<1x3xf32> -> tensor<3x1xf32>
%4 = flow.tensor.reshape %2 : tensor<1xf32> -> tensor<f32>
%5 = flow.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%0, %3, %4) : (tensor<2x3xf32>, tensor<3x1xf32>, tensor<f32>) -> tensor<2x1xf32>
check.expect_eq(%5, %_constant) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ConvertToStream (iree-stream-conversion) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%cst = arith.constant 0.000000e+00 : f32
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = stream.tensor.constant : tensor<1xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.tensor.sizeof tensor<3x1xf32> : index
%10 = stream.tensor.clone %6 : tensor<1x3xf32> in !stream.resource<*>{%8} -> tensor<3x1xf32> in !stream.resource<*>{%9}
%11 = stream.resource.size %7 : !stream.resource<*>
%12 = stream.tensor.sizeof tensor<f32> : index
%13 = stream.tensor.clone %7 : tensor<1xf32> in !stream.resource<*>{%11} -> tensor<f32> in !stream.resource<*>{%12}
%c0 = arith.constant 0 : index
%14 = stream.resource.size %5 : !stream.resource<*>
%15 = stream.tensor.sizeof tensor<2x1xf32> : index
%16 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %14 for %14], %10[%c0 to %9 for %9], %13[%c0 to %12 for %12]) : (!stream.resource<*>{%14}, !stream.resource<*>{%9}, !stream.resource<*>{%12}) -> !stream.resource<*>{%15}
%17 = stream.async.transfer %16 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%15} -> tensor<2x1xf32>
%19 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%20 = stream.tensor.export %19 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%18, %20) : tensor<2x1xf32>
return
}
}
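// After ConvertToStream, tensors become !stream.resource values with explicit byte
// sizes: the flow.tensor.reshape ops turn into stream.tensor.clone, the dispatch
// becomes a stream.async.dispatch over sized resource ranges, and stream.tensor.sizeof
// supplies those sizes. A small Python sketch of what the sizeof values work out to
// for these static shapes, assuming densely packed 4-byte f32 elements (hypothetical
// helper, not an IREE API):
from functools import reduce
def sizeof_f32(shape):
    # product of the dims (1 for a 0-d tensor) times 4 bytes per f32 element
    return reduce(lambda a, b: a * b, shape, 1) * 4
print(sizeof_f32([3, 1]), sizeof_f32([]), sizeof_f32([2, 1]))  # 12 4 8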
// -----// IR Dump Before VerifyLoweringToTensors (iree-stream-verify-lowering-to-tensors) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%cst = arith.constant 0.000000e+00 : f32
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = stream.tensor.constant : tensor<1xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.tensor.sizeof tensor<3x1xf32> : index
%10 = stream.tensor.clone %6 : tensor<1x3xf32> in !stream.resource<*>{%8} -> tensor<3x1xf32> in !stream.resource<*>{%9}
%11 = stream.resource.size %7 : !stream.resource<*>
%12 = stream.tensor.sizeof tensor<f32> : index
%13 = stream.tensor.clone %7 : tensor<1xf32> in !stream.resource<*>{%11} -> tensor<f32> in !stream.resource<*>{%12}
%c0 = arith.constant 0 : index
%14 = stream.resource.size %5 : !stream.resource<*>
%15 = stream.tensor.sizeof tensor<2x1xf32> : index
%16 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %14 for %14], %10[%c0 to %9 for %9], %13[%c0 to %12 for %12]) : (!stream.resource<*>{%14}, !stream.resource<*>{%9}, !stream.resource<*>{%12}) -> !stream.resource<*>{%15}
%17 = stream.async.transfer %16 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%15} -> tensor<2x1xf32>
%19 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%20 = stream.tensor.export %19 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%18, %20) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After VerifyLoweringToTensors (iree-stream-verify-lowering-to-tensors) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%cst = arith.constant 0.000000e+00 : f32
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = stream.tensor.constant : tensor<1xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.tensor.sizeof tensor<3x1xf32> : index
%10 = stream.tensor.clone %6 : tensor<1x3xf32> in !stream.resource<*>{%8} -> tensor<3x1xf32> in !stream.resource<*>{%9}
%11 = stream.resource.size %7 : !stream.resource<*>
%12 = stream.tensor.sizeof tensor<f32> : index
%13 = stream.tensor.clone %7 : tensor<1xf32> in !stream.resource<*>{%11} -> tensor<f32> in !stream.resource<*>{%12}
%c0 = arith.constant 0 : index
%14 = stream.resource.size %5 : !stream.resource<*>
%15 = stream.tensor.sizeof tensor<2x1xf32> : index
%16 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %14 for %14], %10[%c0 to %9 for %9], %13[%c0 to %12 for %12]) : (!stream.resource<*>{%14}, !stream.resource<*>{%9}, !stream.resource<*>{%12}) -> !stream.resource<*>{%15}
%17 = stream.async.transfer %16 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%15} -> tensor<2x1xf32>
%19 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%20 = stream.tensor.export %19 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%18, %20) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
// -----// IR Dump Before CSE (cse) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = stream.tensor.constant : tensor<1xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.resource.size %6 : !stream.resource<*>
%9 = stream.tensor.sizeof tensor<3x1xf32> : index
%10 = stream.tensor.clone %6 : tensor<1x3xf32> in !stream.resource<*>{%8} -> tensor<3x1xf32> in !stream.resource<*>{%9}
%11 = stream.resource.size %7 : !stream.resource<*>
%12 = stream.tensor.sizeof tensor<f32> : index
%13 = stream.tensor.clone %7 : tensor<1xf32> in !stream.resource<*>{%11} -> tensor<f32> in !stream.resource<*>{%12}
%c0 = arith.constant 0 : index
%14 = stream.resource.size %5 : !stream.resource<*>
%15 = stream.tensor.sizeof tensor<2x1xf32> : index
%16 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %14 for %14], %10[%c0 to %9 for %9], %13[%c0 to %12 for %12]) : (!stream.resource<*>{%14}, !stream.resource<*>{%9}, !stream.resource<*>{%12}) -> !stream.resource<*>{%15}
%17 = stream.async.transfer %16 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%15} -> tensor<2x1xf32>
%19 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%20 = stream.tensor.export %19 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%18, %20) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
// -----// IR Dump Before CSE (cse) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 1.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
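// Note: compared with the pre-canonicalization dump of @_tensor_float above, the tensor<1xf32>
// bias constant is now materialized with stream.tensor.splat of 1.0, and the two
// stream.tensor.clone reshapes (1x3 -> 3x1 and 1xf32 -> f32) have been folded away, so the
// dispatch consumes %6 and %7 directly.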
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 1.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump Before CSE (cse) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
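// Note: the Before/After dumps for the three initializers and the two functions interleave in
// this log, apparently because these passes run per nested operation rather than once over the
// whole module.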
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
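// Note: the public @tensor_float entry point (iree.abi.stub) only forwards to the private
// @_tensor_float, so Canonicalizer and CSE leave it unchanged in every dump.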
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 1.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%cst = arith.constant 1.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%cst = arith.constant 1.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
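// Note: SimplifyGlobalAccesses hoisted all util.global.load ops for the constant resources and
// their sizes to the top of @_tensor_float, ahead of the arith constants and stream ops; the
// remaining ops are the same, only reordered (the transfers now follow the grouped loads).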
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%cst = arith.constant 0.000000e+00 : f32
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%cst = arith.constant 1.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
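// Note: the dispatch @_tensor_float_dispatch_0_matmul_2x1x3 implements the matmul-plus-bias of
// the fully-connected computation: linalg.matmul of the 2x3 input with the 3x1 (transposed)
// weights, then a linalg.generic that broadcast-adds the scalar bias. For the test data:
//   [1 2 3] . [7 8 9] = 50, + 1.0 bias = 51
//   [4 5 6] . [7 8 9] = 122, + 1.0 bias = 123
// which matches the expected @_constant value dense<[[51.0], [123.0]]>.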
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
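// Note: relative to the Before dump, ApplyPatterns only reordered constants: the arith.constant
// ops in @_tensor_float now precede the global loads, and %cst = 0.0 in the dispatch moved ahead
// of the binding subspans; no ops were added or removed.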
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
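// Note: FoldGlobals makes no changes here; the Before and After dumps are identical, and each
// constant keeps its own initializer and separate __size global.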
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
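// Note: FuseGlobals is likewise a no-op on this module; the resource globals and their __size
// companions remain separate in the Before IPO dump that follows.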
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before CombineInitializers (iree-util-combine-initializers) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
util.initializer.return
}
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_0 : !stream.resource<constant>
util.global.store %0, @_constant_0__size : index
util.initializer.return
}
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant_1 : !stream.resource<constant>
util.global.store %0, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
%cst_0 = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%1 = stream.resource.size %cst_0 : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %1, @_constant_0__size : index
%cst_1 = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%2 = stream.resource.size %cst_1 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %2, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
}
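// Note: comparing with the dump above, CombineInitializers has merged the three
// separate util.initializer blocks into the single initializer seen here, which
// materializes all three constants and stores their sizes back-to-back; the rest
// of the module is unchanged.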
// -----// IR Dump Before EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
util.initializer {
%cst = stream.tensor.constant : tensor<2x1xf32> in !stream.resource<constant> = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%0 = stream.resource.size %cst : !stream.resource<constant>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %0, @_constant__size : index
%cst_0 = stream.tensor.constant : tensor<1x3xf32> in !stream.resource<constant> = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%1 = stream.resource.size %cst_0 : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %1, @_constant_0__size : index
%cst_1 = stream.tensor.constant : tensor<2x3xf32> in !stream.resource<constant> = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
%2 = stream.resource.size %cst_1 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %2, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump Before EncodeDeviceTensors (iree-stream-encode-device-tensors) //----- //
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
// -----// IR Dump Before EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
func.func private @_tensor_float() {
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 1.000000e+00 : f32
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.tensor.sizeof tensor<1xf32> : index
%1 = stream.tensor.splat %cst : f32 -> tensor<1xf32> in !stream.resource<*>{%0}
%2 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%3 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%4 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%5 = util.optimization_barrier %4 : !stream.resource<*>
%6 = util.optimization_barrier %3 : !stream.resource<*>
%7 = util.optimization_barrier %1 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<3x1xf32> : index
%9 = stream.tensor.sizeof tensor<f32> : index
%10 = stream.resource.size %5 : !stream.resource<*>
%11 = stream.tensor.sizeof tensor<2x1xf32> : index
%12 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%5[%c0 to %10 for %10], %6[%c0 to %8 for %8], %7[%c0 to %9 for %9]) : (!stream.resource<*>{%10}, !stream.resource<*>{%8}, !stream.resource<*>{%9}) -> !stream.resource<*>{%11}
%13 = stream.async.transfer %12 : !stream.resource<*>{%11} -> !stream.resource<external>{%11}
%14 = stream.tensor.export %13 : tensor<2x1xf32> in !stream.resource<external>{%11} -> tensor<2x1xf32>
%15 = stream.async.transfer %2 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%14, %16) : tensor<2x1xf32>
return
}
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
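// Note: EncodeHostTensors lowers each stream.tensor.constant to a
// stream.async.constant with a statically computed byte size
// (tensor<2x1xf32> -> 8 bytes, tensor<1x3xf32> -> 12 bytes, tensor<2x3xf32> -> 24 bytes,
// i.e. element count x 4 bytes for f32), so the stream.resource.size queries fold away.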
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After EncodeDeviceTensors (iree-stream-encode-device-tensors) //----- //
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
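// Note: EncodeDeviceTensors leaves this executable unchanged (its plain f32
// tensors need no re-encoding); the affine maps appear inlined here, presumably
// only because the executable is dumped in isolation rather than under the
// module that defines the #map aliases.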
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%1 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%2 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%3 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %2 : !stream.resource<*>
%6 = util.optimization_barrier %0 : !stream.resource<*>
%7 = stream.resource.size %4 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%4[%c0 to %7 for %7], %5[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
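// Note: in @_tensor_float, EncodeHostTensors folds the stream.tensor.sizeof ops
// to constants (tensor<f32> -> 4, tensor<3x1xf32> -> 12, tensor<2x1xf32> -> 8) and
// rewrites the f32 splat of 1.0 as an i32 splat of 1065353216 (0x3F800000, the
// IEEE-754 bit pattern of 1.0f).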
// -----// IR Dump Before MaterializeBuiltins (iree-stream-materialize-builtins) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%1 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%2 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%3 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %2 : !stream.resource<*>
%6 = util.optimization_barrier %0 : !stream.resource<*>
%7 = stream.resource.size %4 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%4[%c0 to %7 for %7], %5[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After MaterializeBuiltins (iree-stream-materialize-builtins) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%1 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%2 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%3 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %2 : !stream.resource<*>
%6 = util.optimization_barrier %0 : !stream.resource<*>
%7 = stream.resource.size %4 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%4[%c0 to %7 for %7], %5[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%1 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%2 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%3 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %2 : !stream.resource<*>
%6 = util.optimization_barrier %0 : !stream.resource<*>
%7 = stream.resource.size %4 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%4[%c0 to %7 for %7], %5[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump Before CSE (cse) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %c8, @_constant__size : index
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.initializer.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
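// Note: the canonicalizer only reorders @_tensor_float here, sinking the
// stream.async.splat so it sits next to the optimization barrier that consumes
// it; no ops are added or removed.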
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.global.store %c8, @_constant__size : index
util.initializer.return
}
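// NOTE: This util.initializer dump (After SimplifyGlobalAccesses) appears in
// between the Before/After CSE dumps of @_tensor_float because these passes run
// and print per operation, so their output interleaves. At this point the
// initializer still stores both the three stream constants and their byte sizes
// (8, 12, 24) into the @_constant*__size globals; the size globals are inlined
// and then folded away by ApplyPatterns and FoldGlobals further below.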
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
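// NOTE: SimplifyGlobalAccesses hoisted the util.global.load ops to the top of
// @_tensor_float, ahead of the arith.constant ops. The globals are only written
// in the initializer, so the loads can be reordered freely; the rest of the
// function body is unchanged.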
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size : index
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__size : index
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__size : index
util.global.store %c8, @_constant__size : index
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size = 8 : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size = 12 : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size = 24 : index
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
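// NOTE: ApplyPatterns attached the known byte sizes directly to the size
// globals (@_constant__size = 8, @_constant_0__size = 12, @_constant_1__size = 24)
// and dropped the corresponding util.global.store ops from the initializer. The
// loads in @_tensor_float still go through those globals at this point.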
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant__size = 8 : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_0__size = 12 : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private @_constant_1__size = 24 : index
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__size = util.global.load @_constant__size : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__size = util.global.load @_constant_0__size : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__size = util.global.load @_constant_1__size : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%_constant__size} -> !stream.resource<*>{%_constant__size}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%_constant_0__size} -> !stream.resource<*>{%_constant_0__size}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%_constant_1__size} -> !stream.resource<*>{%_constant_1__size}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%_constant__size} -> !stream.resource<external>{%_constant__size}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%_constant__size} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
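// NOTE: FoldGlobals removed the three *__size globals entirely: their loads in
// @_tensor_float are replaced by the constants %c8, %c12 and %c24, so every
// transfer and export size is now static. Only the 2x3 input is still sized via
// stream.resource.size, since %3 is the result of a util.optimization_barrier.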
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
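// NOTE: FuseGlobals leaves the module unchanged (the Before/After dumps are
// identical): each remaining constant global has a single store in the
// initializer and its own distinct loads, so there is nothing to fuse.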
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
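// NOTE: IPO is likewise a no-op here; @_tensor_float takes no arguments and
// returns nothing, so there is nothing to propagate across the single call from
// @tensor_float, and the module is printed unchanged.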
// -----// IR Dump Before MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before ElideAsyncCopies (iree-stream-elide-async-copies) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ElideAsyncCopies (iree-stream-elide-async-copies) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before EmplaceAllocations (iree-stream-emplace-allocations) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before EmplaceAllocations (iree-stream-emplace-allocations) //----- //
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After EmplaceAllocations (iree-stream-emplace-allocations) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before EmplaceAllocations (iree-stream-emplace-allocations) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After EmplaceAllocations (iree-stream-emplace-allocations) //----- //
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After EmplaceAllocations (iree-stream-emplace-allocations) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before RefineUsage (iree-stream-refine-usage) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c24 = arith.constant 24 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<*>{%c8}
%1 = stream.async.transfer %_constant_0 : !stream.resource<constant>{%c12} -> !stream.resource<*>{%c12}
%2 = stream.async.transfer %_constant_1 : !stream.resource<constant>{%c24} -> !stream.resource<*>{%c24}
%3 = util.optimization_barrier %2 : !stream.resource<*>
%4 = util.optimization_barrier %1 : !stream.resource<*>
%5 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<*>{%c4}
%6 = util.optimization_barrier %5 : !stream.resource<*>
%7 = stream.resource.size %3 : !stream.resource<*>
%8 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%3[%c0 to %7 for %7], %4[%c0 to %c12 for %c12], %6[%c0 to %c4 for %c4]) : (!stream.resource<*>{%7}, !stream.resource<*>{%c12}, !stream.resource<*>{%c4}) -> !stream.resource<*>{%c8}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.async.transfer %0 : !stream.resource<*>{%c8} -> !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After RefineUsage (iree-stream-refine-usage) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before CSE (cse) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
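For reference, the dispatch in the dump above is the lowered form of the original tosa.fully_connected: a linalg.matmul of the 2x3 input against the transposed 3x1 weights, followed by a linalg.generic that adds the scalar bias. The expected constant [[51], [123]] and the i32 splat value 1065353216 (the bit pattern of 1.0f, 0x3F800000, used to materialize the 4-byte bias buffer) can be checked with a short NumPy sketch. This is purely illustrative and not part of the compiler output; the variable names are invented here.

import struct
import numpy as np

lhs = np.array([[1.0, 2.0, 3.0],
                [4.0, 5.0, 6.0]], dtype=np.float32)       # tensor<2x3xf32>
weights = np.array([[7.0, 8.0, 9.0]], dtype=np.float32)   # tensor<1x3xf32>
bias = np.array([1.0], dtype=np.float32)                  # tensor<1xf32>

# fully_connected == lhs @ weights^T + bias; the dispatch consumes the
# already-transposed tensor<3x1xf32> weights in linalg.matmul.
out = lhs @ weights.T + bias
print(out)  # [[ 51.] [123.]] -- matches the dense<[[5.1e+01], [1.23e+02]]> constant

# 1065353216 is the IEEE-754 bit pattern of 1.0f (0x3F800000), the value
# stream.async.splat writes into the 4-byte bias resource.
assert struct.unpack("<f", struct.pack("<i", 1065353216))[0] == 1.0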
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before ScheduleExecution (iree-stream-schedule-execution) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
util.global.store %cst, @_constant : !stream.resource<constant>
util.global.store %cst_0, @_constant_0 : !stream.resource<constant>
util.global.store %cst_1, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before ScheduleExecution (iree-stream-schedule-execution) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before ScheduleExecution (iree-stream-schedule-execution) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.async.transfer %_constant : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
%1 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%2 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%3 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
%4 = util.optimization_barrier %3 : !stream.resource<transient>
%5 = stream.resource.size %1 : !stream.resource<constant>
%6 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%1[%c0 to %5 for %5], %2[%c0 to %c12 for %c12], %4[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%5}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
%7 = stream.tensor.export %6 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%8 = stream.tensor.export %0 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%7, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
%0:3 = stream.timepoint.await %result_timepoint => %results#0, %results#1, %results#2 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
util.global.store %0#0, @_constant : !stream.resource<constant>
util.global.store %0#1, @_constant_0 : !stream.resource<constant>
util.global.store %0#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
%0:3 = stream.timepoint.await %result_timepoint => %results#0, %results#1, %results#2 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
util.global.store %0#0, @_constant : !stream.resource<constant>
util.global.store %0#1, @_constant_0 : !stream.resource<constant>
util.global.store %0#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%9 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%0 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%1 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%9 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %9 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = stream.resource.size %0 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%0 as %arg0: !stream.resource<constant>{%4}, %1 as %arg1: !stream.resource<constant>{%c12}, %3 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%9 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %4 for %4], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%4}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%6 = stream.tensor.export %5 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%8 = stream.tensor.export %7 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%6, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump Before ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%9 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%0 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%1 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%9 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %9 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = stream.resource.size %0 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%0 as %arg0: !stream.resource<constant>{%4}, %1 as %arg1: !stream.resource<constant>{%c12}, %3 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%9 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %4 for %4], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%4}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%6 = stream.tensor.export %5 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%8 = stream.tensor.export %7 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%6, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
%0:3 = stream.timepoint.await %result_timepoint => %results#0, %results#1, %results#2 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
util.global.store %0#0, @_constant : !stream.resource<constant>
util.global.store %0#1, @_constant_0 : !stream.resource<constant>
util.global.store %0#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%9 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%0 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%1 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%9 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %9 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = stream.resource.size %0 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%0 as %arg0: !stream.resource<constant>{%4}, %1 as %arg1: !stream.resource<constant>{%c12}, %3 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%9 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %4 for %4], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%4}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%6 = stream.tensor.export %5 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%8 = stream.tensor.export %7 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%6, %8) : tensor<2x1xf32>
return
}
// -----// IR Dump Before PropagateTimepoints (iree-stream-propagate-timepoints) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
%0:3 = stream.timepoint.await %result_timepoint => %results#0, %results#1, %results#2 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
util.global.store %0#0, @_constant : !stream.resource<constant>
util.global.store %0#1, @_constant_0 : !stream.resource<constant>
util.global.store %0#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%9 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%0 = util.optimization_barrier %_constant_1 : !stream.resource<constant>
%1 = util.optimization_barrier %_constant_0 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%9 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %9 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = stream.resource.size %0 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%0 as %arg0: !stream.resource<constant>{%4}, %1 as %arg1: !stream.resource<constant>{%c12}, %3 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%9 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %4 for %4], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%4}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %9 : !stream.resource<external>{%c8}
} => !stream.timepoint
%5 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%6 = stream.tensor.export %5 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%8 = stream.tensor.export %7 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%6, %8) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_1__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
%0:3 = stream.timepoint.await %result_timepoint => %results#0, %results#1, %results#2 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%0 = stream.timepoint.await %_constant__timepoint => %_constant : !stream.resource<constant>{%c8}
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_0 : !stream.resource<constant>
%2 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%1}
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%3 = stream.resource.size %_constant_1 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%3}
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%18 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %18 : !stream.resource<external>{%c8}
} => !stream.timepoint
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = util.optimization_barrier %2 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%18 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %18 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%7 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%8 = util.optimization_barrier %7 : !stream.resource<transient>
%9 = stream.resource.size %5 : !stream.resource<constant>
%10 = stream.timepoint.immediate => !stream.timepoint
%11 = stream.timepoint.immediate => !stream.timepoint
%12 = stream.timepoint.immediate => !stream.timepoint
%13 = stream.timepoint.immediate => !stream.timepoint
%results_2, %result_timepoint_3 = stream.async.execute await(%13) => with(%5 as %arg0: !stream.resource<constant>{%9}, %6 as %arg1: !stream.resource<constant>{%c12}, %8 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%18 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %9 for %9], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%9}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %18 : !stream.resource<external>{%c8}
} => !stream.timepoint
%14 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%15 = stream.tensor.export %14 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%16 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%17 = stream.tensor.export %16 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%15, %17) : tensor<2x1xf32>
return
}
}
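// Note (added, not compiler output): the dense<[[5.100000e+01], [1.230000e+02]]> constant carried through these dumps is simply the constant-folded result of the original tosa.fully_connected: each row of the 2x3 input is dotted with the single 1x3 weight row and offset by the 1.0 bias. A minimal Python/NumPy sketch reproducing that arithmetic (illustrative only; NumPy is an assumption here, the compiler itself does this fold internally):

# Check the folded fully_connected value that appears as dense<[[5.1e+01], [1.23e+02]]>.
import numpy as np

inputs  = np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32)   # tensor<2x3xf32>
weights = np.array([[7., 8., 9.]], dtype=np.float32)                 # tensor<1x3xf32>
bias    = np.array([1.], dtype=np.float32)                           # tensor<1xf32>

# fully_connected == inputs @ weights^T + bias: a 2x3 * 3x1 matmul plus a broadcast
# add, matching the linalg.fill / linalg.matmul / linalg.generic sequence in the dispatch.
result = inputs @ weights.T + bias
print(result)   # [[ 51.] [123.]]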
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
%0:3 = stream.timepoint.await %result_timepoint => %results#0, %results#1, %results#2 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
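// Note (added, not compiler output): the three resource sizes %c8, %c12 and %c24 in this initializer are just the pooled tensors' element counts times 4 bytes per f32. A quick illustrative check in Python:

# f32 = 4 bytes; sizes match %c8, %c12, %c24 above.
shapes = {"_constant (2x1xf32)": (2, 1),
          "_constant_0 (1x3xf32)": (1, 3),
          "_constant_1 (2x3xf32)": (2, 3)}
for name, (rows, cols) in shapes.items():
    print(name, "->", rows * cols * 4, "bytes")   # 8, 12, 24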
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%0 = stream.timepoint.await %_constant__timepoint => %_constant : !stream.resource<constant>{%c8}
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_0 : !stream.resource<constant>
%2 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%1}
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%3 = stream.resource.size %_constant_1 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%3}
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%18 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %18 : !stream.resource<external>{%c8}
} => !stream.timepoint
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = util.optimization_barrier %2 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%18 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %18 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%7 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%8 = util.optimization_barrier %7 : !stream.resource<transient>
%9 = stream.resource.size %5 : !stream.resource<constant>
%10 = stream.timepoint.immediate => !stream.timepoint
%11 = stream.timepoint.immediate => !stream.timepoint
%12 = stream.timepoint.immediate => !stream.timepoint
%13 = stream.timepoint.immediate => !stream.timepoint
%results_2, %result_timepoint_3 = stream.async.execute await(%13) => with(%5 as %arg0: !stream.resource<constant>{%9}, %6 as %arg1: !stream.resource<constant>{%c12}, %8 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%18 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %9 for %9], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%9}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %18 : !stream.resource<external>{%c8}
} => !stream.timepoint
%14 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%15 = stream.tensor.export %14 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%16 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%17 = stream.tensor.export %16 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%15, %17) : tensor<2x1xf32>
return
}
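// Note (added, not compiler output): the %c1065353216_i32 value splatted into the 4-byte transient resource above is the IEEE-754 bit pattern of 1.0f, i.e. the bias constant round-tripped through an i32 splat. A small sketch using Python's standard struct module (illustrative only):

# Show that 1065353216 (0x3F800000) is the f32 bit pattern of 1.0, the value
# materialized by the stream.async.splat in this function.
import struct
bits = 1065353216
value = struct.unpack("<f", struct.pack("<I", bits))[0]
print(hex(bits), "->", value)   # 0x3f800000 -> 1.0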
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before CSE (cse) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.initializer.return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_1__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_1__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_1__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_1__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
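// Annotation (not part of the compiler output): the @_tensor_float_dispatch_0 body above is the
// lowered tosa.fully_connected — linalg.fill + linalg.matmul + a bias-add linalg.generic — with the
// 1x3 weight buffer rebound as tensor<3x1xf32>, so it computes input x weight^T + bias. A minimal
// NumPy sketch reproducing the dense<[[51.],[123.]]> value that check.expect_eq compares against,
// assuming standard tosa.fully_connected semantics (out = input @ weight^T + bias):
import numpy as np

lhs = np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32)  # tensor<2x3xf32> input
weights = np.array([[7., 8., 9.]], dtype=np.float32)            # tensor<1x3xf32> weights
bias = np.array([1.], dtype=np.float32)                         # tensor<1xf32> bias

out = lhs @ weights.T + bias  # (2x3)@(3x1) + bias -> 1*7+2*8+3*9+1 = 51, 4*7+5*8+6*9+1 = 123
print(out)                    # [[ 51.] [123.]]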
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_1__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_0__timepoint : !stream.timepoint
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant_1__timepoint : !stream.timepoint
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant_1__timepoint = util.global.load @_constant_1__timepoint : !stream.timepoint
%_constant_0__timepoint = util.global.load @_constant_0__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant_1__timepoint => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant_0__timepoint => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_0, %result_timepoint_1 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_4, %result_timepoint_5 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_5 => %results_4 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
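// Annotation: FuseGlobals merges the three per-constant timepoint globals (@_constant__timepoint,
// @_constant_0__timepoint, @_constant_1__timepoint), which the initializer always stored with the
// same %result_timepoint, into the single @_constant__timepoint; @_tensor_float now loads that one
// global three times instead of loading three separate globals.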
// -----// IR Dump Before IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_4, %result_timepoint_5 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_5 => %results_4 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_4, %result_timepoint_5 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_5 => %results_4 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
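// Annotation: IPO leaves this module unchanged; the dump above is identical to the pre-pass dump.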
// -----// IR Dump Before VerifyLoweringToAsync (iree-stream-verify-lowering-to-async) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_4, %result_timepoint_5 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_5 => %results_4 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After VerifyLoweringToAsync (iree-stream-verify-lowering-to-async) //----- //
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_4, %result_timepoint_5 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_5 => %results_4 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before ScheduleAllocation (iree-stream-schedule-allocation) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before ScheduleAllocation (iree-stream-schedule-allocation) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%results, %result_timepoint = stream.async.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}) -> !stream.resource<external>{%c8} {
%13 = stream.async.transfer %arg0 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%3 = util.optimization_barrier %2 : !stream.resource<constant>
%4 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%results_2, %result_timepoint_3 = stream.async.execute with() -> !stream.resource<transient>{%c4} {
%13 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c4}
stream.yield %13 : !stream.resource<transient>{%c4}
} => !stream.timepoint
%6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<transient>{%c4}
%7 = util.optimization_barrier %6 : !stream.resource<transient>
%8 = stream.resource.size %3 : !stream.resource<constant>
%results_4, %result_timepoint_5 = stream.async.execute with(%3 as %arg0: !stream.resource<constant>{%8}, %5 as %arg1: !stream.resource<constant>{%c12}, %7 as %arg2: !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8} {
%13 = stream.async.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1](%arg0[%c0 to %8 for %8], %arg1[%c0 to %c12 for %c12], %arg2[%c0 to %c4 for %c4]) : (!stream.resource<constant>{%8}, !stream.resource<constant>{%c12}, !stream.resource<transient>{%c4}) -> !stream.resource<external>{%c8}
stream.yield %13 : !stream.resource<external>{%c8}
} => !stream.timepoint
%9 = stream.timepoint.await %result_timepoint_5 => %results_4 : !stream.resource<external>{%c8}
%10 = stream.tensor.export %9 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%11 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c8}
%12 = stream.tensor.export %11 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%10, %12) : tensor<2x1xf32>
return
}
// -----// IR Dump Before ScheduleAllocation (iree-stream-schedule-allocation) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.async.execute with() -> (!stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}) {
%cst = stream.async.constant : !stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>
%cst_0 = stream.async.constant : !stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
%cst_1 = stream.async.constant : !stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
stream.yield %cst, %cst_0, %cst_1 : !stream.resource<constant>{%c8}, !stream.resource<constant>{%c12}, !stream.resource<constant>{%c24}
} => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %result_timepoint, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before PackConstants (iree-stream-pack-constants) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
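// Annotation: ScheduleAllocation rewrites the stream.async.execute regions in @_tensor_float into
// stream.cmd.execute regions with explicit allocations: the constant-to-external transfer becomes a
// stream.cmd.copy into a freshly allocated external resource, the i32 splat becomes a
// stream.cmd.fill of a transient allocation, and the dispatch now writes (wo) into a separately
// allocated external result buffer.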
// -----// IR Dump Before PackConstants (iree-stream-pack-constants) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump After PackConstants (iree-stream-pack-constants) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before PackAllocations (iree-stream-pack-allocations) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.resource.constants :
!stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
!stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
!stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
=> !stream.timepoint
%0 = stream.cmd.execute with() {
} => !stream.timepoint
%1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump Before PackConstants (iree-stream-pack-constants) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%results:3, %result_timepoint = stream.resource.constants :
!stream.resource<constant>{%c8} = dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
!stream.resource<constant>{%c12} = dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
!stream.resource<constant>{%c24} = dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
=> !stream.timepoint
%0 = stream.cmd.execute with() {
} => !stream.timepoint
%1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint
util.global.store %results#0, @_constant : !stream.resource<constant>
util.global.store %results#1, @_constant_0 : !stream.resource<constant>
util.global.store %results#2, @_constant_1 : !stream.resource<constant>
util.global.store %1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After PackConstants (iree-stream-pack-constants) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump Before PackAllocations (iree-stream-pack-allocations) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump After PackConstants (iree-stream-pack-constants) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%c0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %1, @_constant : !stream.resource<constant>
util.global.store %2, @_constant_0 : !stream.resource<constant>
util.global.store %3, @_constant_1 : !stream.resource<constant>
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump Before PackAllocations (iree-stream-pack-allocations) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%c0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %1, @_constant : !stream.resource<constant>
util.global.store %2, @_constant_0 : !stream.resource<constant>
util.global.store %3, @_constant_1 : !stream.resource<constant>
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump Before LayoutSlices (iree-stream-layout-slices) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%c0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %1, @_constant : !stream.resource<constant>
util.global.store %2, @_constant_0 : !stream.resource<constant>
util.global.store %3, @_constant_1 : !stream.resource<constant>
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump Before LayoutSlices (iree-stream-layout-slices) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%c0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %1, @_constant : !stream.resource<constant>
util.global.store %2, @_constant_0 : !stream.resource<constant>
util.global.store %3, @_constant_1 : !stream.resource<constant>
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before LayoutSlices (iree-stream-layout-slices) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) //----- //
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%c0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %1, @_constant : !stream.resource<constant>
util.global.store %2, @_constant_0 : !stream.resource<constant>
util.global.store %3, @_constant_1 : !stream.resource<constant>
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before PropagateSubranges (iree-util-propagate-subranges) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%c0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %1, @_constant : !stream.resource<constant>
util.global.store %2, @_constant_0 : !stream.resource<constant>
util.global.store %3, @_constant_1 : !stream.resource<constant>
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.size %_constant_0 : !stream.resource<constant>
%1 = stream.resource.size %_constant_1 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%3 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c8}, %2 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%4 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%1}
%5 = util.optimization_barrier %4 : !stream.resource<constant>
%6 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%0}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %5 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%5 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant__storage_size : index
util.global private mutable @_constant__offset : index
util.global private mutable @_constant__length : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_0__storage_size : index
util.global private mutable @_constant_0__offset : index
util.global private mutable @_constant_0__length : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private mutable @_constant_1__storage_size : index
util.global private mutable @_constant_1__offset : index
util.global private mutable @_constant_1__length : index
util.initializer {
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%c0_0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0_0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0_0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0_0], %arg1[%c0_0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0_0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %0#0, @_constant : !stream.resource<constant>
util.global.store %c192, @_constant__storage_size : index
util.global.store %c0_0, @_constant__offset : index
util.global.store %c8, @_constant__length : index
util.global.store %0#0, @_constant_0 : !stream.resource<constant>
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c12, @_constant_0__length : index
util.global.store %0#0, @_constant_1 : !stream.resource<constant>
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c24, @_constant_1__length : index
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant__length = util.global.load @_constant__length : index
%0 = stream.resource.subview %_constant[%_constant__offset] : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<constant>{%_constant__length}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%1 = stream.resource.subview %_constant_0[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%2 = stream.resource.subview %_constant_1[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%3 = stream.resource.size %1 : !stream.resource<constant>
%4 = stream.resource.size %2 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%6 = stream.cmd.execute await(%_constant__timepoint) => with(%0 as %arg0: !stream.resource<constant>{%c8}, %5 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%7 = stream.timepoint.await %_constant__timepoint_0 => %2 : !stream.resource<constant>{%4}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.timepoint.await %_constant__timepoint_1 => %1 : !stream.resource<constant>{%3}
%10 = util.optimization_barrier %9 : !stream.resource<constant>
%11 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%12 = stream.cmd.execute with(%11 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %11 : !stream.resource<transient>{%c4}
%14 = util.optimization_barrier %13 : !stream.resource<transient>
%15 = stream.resource.size %8 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%16 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%17 = stream.cmd.execute with(%8 as %arg0: !stream.resource<constant>{%15}, %10 as %arg1: !stream.resource<constant>{%c12}, %14 as %arg2: !stream.resource<transient>{%c4}, %16 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %15] : !stream.resource<constant>{%15},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%18 = stream.timepoint.await %17 => %16 : !stream.resource<external>{%c8}
%19 = stream.tensor.export %18 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%20 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c8}
%21 = stream.tensor.export %20 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%19, %21) : tensor<2x1xf32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.initializer {
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%c0_0 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%did_map, %result = stream.resource.try_map %buffer_cst[%c0_0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%0:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
%6 = stream.timepoint.immediate => !stream.timepoint
scf.yield %result, %6 : !stream.resource<constant>, !stream.timepoint
} else {
%6 = stream.resource.map %buffer_cst[%c0_0] : !util.buffer -> !stream.resource<staging>{%c192}
%7 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%8 = stream.cmd.execute with(%6 as %arg0: !stream.resource<staging>{%c192}, %7 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0_0], %arg1[%c0_0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %7, %8 : !stream.resource<constant>, !stream.timepoint
}
%1 = stream.resource.subview %0#0[%c0_0] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
%c64 = arith.constant 64 : index
%2 = stream.resource.subview %0#0[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%c128 = arith.constant 128 : index
%3 = stream.resource.subview %0#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = stream.cmd.execute with() {
} => !stream.timepoint
%5 = stream.timepoint.join max(%0#1, %4) => !stream.timepoint
util.global.store %0#0, @_constant : !stream.resource<constant>
util.global.store %c192, @_constant__storage_size : index
util.global.store %c0_0, @_constant__offset : index
util.global.store %c8, @_constant__length : index
util.global.store %0#0, @_constant_0 : !stream.resource<constant>
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c12, @_constant_0__length : index
util.global.store %0#0, @_constant_1 : !stream.resource<constant>
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c24, @_constant_1__length : index
util.global.store %5, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant__length = util.global.load @_constant__length : index
%0 = stream.resource.subview %_constant[%_constant__offset] : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<constant>{%_constant__length}
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%1 = stream.resource.subview %_constant_0[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%2 = stream.resource.subview %_constant_1[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%3 = stream.resource.size %1 : !stream.resource<constant>
%4 = stream.resource.size %2 : !stream.resource<constant>
%c0_2 = arith.constant 0 : index
%5 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%6 = stream.cmd.execute await(%_constant__timepoint) => with(%0 as %arg0: !stream.resource<constant>{%c8}, %5 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0_2], %arg1[%c0_2], %c8 : !stream.resource<constant>{%c8} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%7 = stream.timepoint.await %_constant__timepoint_0 => %2 : !stream.resource<constant>{%4}
%8 = util.optimization_barrier %7 : !stream.resource<constant>
%9 = stream.timepoint.await %_constant__timepoint_1 => %1 : !stream.resource<constant>{%3}
%10 = util.optimization_barrier %9 : !stream.resource<constant>
%11 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%c0_3 = arith.constant 0 : index
%12 = stream.cmd.execute with(%11 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0_3 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%13 = stream.timepoint.await %12 => %11 : !stream.resource<transient>{%c4}
%14 = util.optimization_barrier %13 : !stream.resource<transient>
%15 = stream.resource.size %8 : !stream.resource<constant>
%c0_4 = arith.constant 0 : index
%16 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%17 = stream.cmd.execute with(%8 as %arg0: !stream.resource<constant>{%15}, %10 as %arg1: !stream.resource<constant>{%c12}, %14 as %arg2: !stream.resource<transient>{%c4}, %16 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %15] : !stream.resource<constant>{%15},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0_4 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%18 = stream.timepoint.await %17 => %16 : !stream.resource<external>{%c8}
%19 = stream.tensor.export %18 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%20 = stream.timepoint.await %6 => %5 : !stream.resource<external>{%c8}
%21 = stream.tensor.export %20 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%19, %21) : tensor<2x1xf32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c128 = arith.constant 128 : index
%c64 = arith.constant 64 : index
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %c192, @_constant__storage_size : index
util.global.store %c0, @_constant__offset : index
util.global.store %c8, @_constant__length : index
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c12, @_constant_0__length : index
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c24, @_constant_1__length : index
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump Before CSE (cse) //----- //
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c128 = arith.constant 128 : index
%c64 = arith.constant 64 : index
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %c192, @_constant__storage_size : index
util.global.store %c0, @_constant__offset : index
util.global.store %c8, @_constant__length : index
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c12, @_constant_0__length : index
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c24, @_constant_1__length : index
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c128 = arith.constant 128 : index
%c64 = arith.constant 64 : index
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %c192, @_constant__storage_size : index
util.global.store %c0, @_constant__offset : index
util.global.store %c8, @_constant__length : index
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c12, @_constant_0__length : index
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c24, @_constant_1__length : index
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c128 = arith.constant 128 : index
%c64 = arith.constant 64 : index
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %c192, @_constant__storage_size : index
util.global.store %c0, @_constant__offset : index
util.global.store %c8, @_constant__length : index
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c12, @_constant_0__length : index
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c24, @_constant_1__length : index
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%_constant_1__storage_size}
%3 = stream.resource.subview %2[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%_constant_0__storage_size}
%6 = stream.resource.subview %5[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %4 : !stream.resource<constant>
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
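// Note: compared with the dump above, canonicalization folded the duplicate zero-index constants into %c0, dropped the
// unused @_constant__length load, folded the first subview into the copy command (indexing %_constant by its offset
// directly), and moved the timepoint awaits onto the whole constant resources, materializing the subviews afterwards.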
// -----// IR Dump Before CSE (cse) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_0 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__timepoint_1 = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint_0 => %_constant_1 : !stream.resource<constant>{%_constant_1__storage_size}
%3 = stream.resource.subview %2[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.timepoint.await %_constant__timepoint_1 => %_constant_0 : !stream.resource<constant>{%_constant_0__storage_size}
%6 = stream.resource.subview %5[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %4 : !stream.resource<constant>
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c128 = arith.constant 128 : index
%c64 = arith.constant 64 : index
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__length : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__length : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c8, @_constant__length : index
util.global.store %c0, @_constant__offset : index
util.global.store %c192, @_constant__storage_size : index
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
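// Note: SimplifyGlobalAccesses only reordered the global stores in this initializer; each global is still written
// exactly once, so the stored values are unchanged.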
// -----// IR Dump After CSE (cse) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint => %_constant_1 : !stream.resource<constant>{%_constant_1__storage_size}
%3 = stream.resource.subview %2[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.timepoint.await %_constant__timepoint => %_constant_0 : !stream.resource<constant>{%_constant_0__storage_size}
%6 = stream.resource.subview %5[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %4 : !stream.resource<constant>
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
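// Note: CSE collapsed the three identical util.global.load ops of @_constant__timepoint into a single load that now
// feeds both stream.timepoint.await ops as well as the awaiting stream.cmd.execute.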
// -----// IR Dump Before SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint => %_constant_1 : !stream.resource<constant>{%_constant_1__storage_size}
%3 = stream.resource.subview %2[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.timepoint.await %_constant__timepoint => %_constant_0 : !stream.resource<constant>{%_constant_0__storage_size}
%6 = stream.resource.subview %5[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %4 : !stream.resource<constant>
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_tensor_float() {
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint => %_constant_1 : !stream.resource<constant>{%_constant_1__storage_size}
%3 = stream.resource.subview %2[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.timepoint.await %_constant__timepoint => %_constant_0 : !stream.resource<constant>{%_constant_0__storage_size}
%6 = stream.resource.subview %5[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %4 : !stream.resource<constant>
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
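// Note: SimplifyGlobalAccesses hoisted all util.global.load ops to the top of the entry block, ahead of the
// arith.constant ops; the loaded values and the rest of the function body are unchanged.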
// -----// IR Dump Before ApplyPatterns (iree-util-apply-patterns) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant__storage_size : index
util.global private mutable @_constant__offset : index
util.global private mutable @_constant__length : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_0__storage_size : index
util.global private mutable @_constant_0__offset : index
util.global private mutable @_constant_0__length : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private mutable @_constant_1__storage_size : index
util.global private mutable @_constant_1__offset : index
util.global private mutable @_constant_1__length : index
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c128 = arith.constant 128 : index
%c64 = arith.constant 64 : index
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c24 = arith.constant 24 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %c12, @_constant_0__length : index
util.global.store %c64, @_constant_0__offset : index
util.global.store %c192, @_constant_0__storage_size : index
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %c24, @_constant_1__length : index
util.global.store %c128, @_constant_1__offset : index
util.global.store %c192, @_constant_1__storage_size : index
util.global.store %c8, @_constant__length : index
util.global.store %c0, @_constant__offset : index
util.global.store %c192, @_constant__storage_size : index
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
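  // Note: this dispatch is the lowered tosa.fully_connected: it matmuls the 2x3 input against the 3x1 (transposed)
  // weights and adds the broadcast scalar bias supplied by the transient buffer filled with 0x3F800000 (1.0f).
  // With the test constants, [[1, 2, 3], [4, 5, 6]] x [[7], [8], [9]] + 1 = [[51], [123]], matching the
  // expected-result tensor packed at offset 0 of the 192-byte composite constant.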
func.func private @_tensor_float() {
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c12 = arith.constant 12 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c4 = arith.constant 4 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2 = stream.timepoint.await %_constant__timepoint => %_constant_1 : !stream.resource<constant>{%_constant_1__storage_size}
%3 = stream.resource.subview %2[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.timepoint.await %_constant__timepoint => %_constant_0 : !stream.resource<constant>{%_constant_0__storage_size}
%6 = stream.resource.subview %5[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%7 = util.optimization_barrier %6 : !stream.resource<constant>
%8 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%9 = stream.cmd.execute with(%8 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%10 = stream.timepoint.await %9 => %8 : !stream.resource<transient>{%c4}
%11 = util.optimization_barrier %10 : !stream.resource<transient>
%12 = stream.resource.size %4 : !stream.resource<constant>
%13 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%14 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%12}, %7 as %arg1: !stream.resource<constant>{%c12}, %11 as %arg2: !stream.resource<transient>{%c4}, %13 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %12] : !stream.resource<constant>{%12},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%15 = stream.timepoint.await %14 => %13 : !stream.resource<external>{%c8}
%16 = stream.tensor.export %15 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%17 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%18 = stream.tensor.export %17 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%16, %18) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant__storage_size = 192 : index
util.global private mutable @_constant__offset = 0 : index
util.global private mutable @_constant__length = 8 : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_0__storage_size = 192 : index
util.global private mutable @_constant_0__offset = 64 : index
util.global private mutable @_constant_0__length = 12 : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private mutable @_constant_1__storage_size = 192 : index
util.global private mutable @_constant_1__offset = 128 : index
util.global private mutable @_constant_1__length = 24 : index
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2:2 = stream.timepoint.await %_constant__timepoint => %_constant_1, %_constant_0 : !stream.resource<constant>{%_constant_1__storage_size}, !stream.resource<constant>{%_constant_0__storage_size}
%3 = stream.resource.subview %2#0[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.resource.subview %2#1[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%8 = stream.cmd.execute with(%7 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<transient>{%c4}
%10 = util.optimization_barrier %9 : !stream.resource<transient>
%11 = stream.resource.size %4 : !stream.resource<constant>
%12 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%13 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%11}, %6 as %arg1: !stream.resource<constant>{%c12}, %10 as %arg2: !stream.resource<transient>{%c4}, %12 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %11] : !stream.resource<constant>{%11},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%14 = stream.timepoint.await %13 => %12 : !stream.resource<external>{%c8}
%15 = stream.tensor.export %14 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%16 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%17 = stream.tensor.export %16 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%15, %17) : tensor<2x1xf32>
return
}
}
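// Note: ApplyPatterns gave the __storage_size/__offset/__length globals constant initial values, so their stores (and
// the now-unused index constants) disappeared from the initializer, and the two stream.timepoint.await ops in
// @_tensor_float were merged into a single await over both constant resources.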
// -----// IR Dump Before FoldGlobals (iree-util-fold-globals) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private mutable @_constant__storage_size = 192 : index
util.global private mutable @_constant__offset = 0 : index
util.global private mutable @_constant__length = 8 : index
util.global private @_constant_0 : !stream.resource<constant>
util.global private mutable @_constant_0__storage_size = 192 : index
util.global private mutable @_constant_0__offset = 64 : index
util.global private mutable @_constant_0__length = 12 : index
util.global private @_constant_1 : !stream.resource<constant>
util.global private mutable @_constant_1__storage_size = 192 : index
util.global private mutable @_constant_1__offset = 128 : index
util.global private mutable @_constant_1__length = 24 : index
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant__storage_size = util.global.load @_constant__storage_size : index
%_constant__offset = util.global.load @_constant__offset : index
%_constant_1__storage_size = util.global.load @_constant_1__storage_size : index
%_constant_1__offset = util.global.load @_constant_1__offset : index
%_constant_1__length = util.global.load @_constant_1__length : index
%_constant_0__storage_size = util.global.load @_constant_0__storage_size : index
%_constant_0__offset = util.global.load @_constant_0__offset : index
%_constant_0__length = util.global.load @_constant_0__length : index
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%_constant__storage_size}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%_constant__offset], %arg1[%c0], %c8 : !stream.resource<constant>{%_constant__storage_size} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2:2 = stream.timepoint.await %_constant__timepoint => %_constant_1, %_constant_0 : !stream.resource<constant>{%_constant_1__storage_size}, !stream.resource<constant>{%_constant_0__storage_size}
%3 = stream.resource.subview %2#0[%_constant_1__offset] : !stream.resource<constant>{%_constant_1__storage_size} -> !stream.resource<constant>{%_constant_1__length}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.resource.subview %2#1[%_constant_0__offset] : !stream.resource<constant>{%_constant_0__storage_size} -> !stream.resource<constant>{%_constant_0__length}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%8 = stream.cmd.execute with(%7 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<transient>{%c4}
%10 = util.optimization_barrier %9 : !stream.resource<transient>
%11 = stream.resource.size %4 : !stream.resource<constant>
%12 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%13 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%11}, %6 as %arg1: !stream.resource<constant>{%c12}, %10 as %arg2: !stream.resource<transient>{%c4}, %12 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %11] : !stream.resource<constant>{%11},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%14 = stream.timepoint.await %13 => %12 : !stream.resource<external>{%c8}
%15 = stream.tensor.export %14 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%16 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%17 = stream.tensor.export %16 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%15, %17) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c64 = arith.constant 64 : index
%c24 = arith.constant 24 : index
%c128 = arith.constant 128 : index
%c192 = arith.constant 192 : index
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c192}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c8 : !stream.resource<constant>{%c192} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2:2 = stream.timepoint.await %_constant__timepoint => %_constant_1, %_constant_0 : !stream.resource<constant>{%c192}, !stream.resource<constant>{%c192}
%3 = stream.resource.subview %2#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.resource.subview %2#1[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%8 = stream.cmd.execute with(%7 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<transient>{%c4}
%10 = util.optimization_barrier %9 : !stream.resource<transient>
%11 = stream.resource.size %4 : !stream.resource<constant>
%12 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%13 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%11}, %6 as %arg1: !stream.resource<constant>{%c12}, %10 as %arg2: !stream.resource<transient>{%c4}, %12 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %11] : !stream.resource<constant>{%11},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%14 = stream.timepoint.await %13 => %12 : !stream.resource<external>{%c8}
%15 = stream.tensor.export %14 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%16 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%17 = stream.tensor.export %16 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%15, %17) : tensor<2x1xf32>
return
}
}
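Side note (not part of the compiler output): the values baked into #composite_of_192b above are enough to sanity-check the dispatch by hand. The fill constant 1065353216 is the i32 bit pattern 0x3F800000, i.e. 1.0f, so the program computes a 2x3 * 3x1 matmul followed by a broadcast add of 1.0. A minimal numpy sketch (hypothetical, with assumed variable names, not taken from the dump) reproduces the expected [[51.], [123.]] result:

import numpy as np

# Constants as laid out in #composite_of_192b (alignment padding bytes omitted).
lhs = np.array([[1.0, 2.0, 3.0],
                [4.0, 5.0, 6.0]], dtype=np.float32)      # tensor<2x3xf32>
weights = np.array([[7.0, 8.0, 9.0]], dtype=np.float32)  # tensor<1x3xf32>
bias = np.float32(1.0)  # transient buffer filled with 0x3F800000 == 1.0f

# matmul(2x3, 3x1) -> 2x1, then broadcast-add the scalar bias,
# mirroring the linalg.fill + linalg.matmul + linalg.generic chain.
result = lhs @ weights.T + bias
print(result)  # [[ 51.] [123.]] -- matches the expected tensor<2x1xf32>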
// -----// IR Dump Before FuseGlobals (iree-util-fuse-globals) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.global private @_constant_0 : !stream.resource<constant>
util.global private @_constant_1 : !stream.resource<constant>
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %1#0, @_constant_0 : !stream.resource<constant>
util.global.store %1#0, @_constant_1 : !stream.resource<constant>
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %9 : tensor<f32>, tensor<2x1xf32>) outs(%7 : tensor<2x1xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%11 = arith.addf %in, %in_0 : f32
linalg.yield %11 : f32
} -> tensor<2x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [2, 1], strides = [1, 1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
return
}
}
}
func.func private @_tensor_float() {
%c64 = arith.constant 64 : index
%c24 = arith.constant 24 : index
%c128 = arith.constant 128 : index
%c192 = arith.constant 192 : index
%c4 = arith.constant 4 : index
%c1065353216_i32 = arith.constant 1065353216 : i32
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%_constant__timepoint = util.global.load @_constant__timepoint : !stream.timepoint
%_constant = util.global.load @_constant : !stream.resource<constant>
%_constant_0 = util.global.load @_constant_0 : !stream.resource<constant>
%_constant_1 = util.global.load @_constant_1 : !stream.resource<constant>
%0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%1 = stream.cmd.execute await(%_constant__timepoint) => with(%_constant as %arg0: !stream.resource<constant>{%c192}, %0 as %arg1: !stream.resource<external>{%c8}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c8 : !stream.resource<constant>{%c192} -> !stream.resource<external>{%c8}
} => !stream.timepoint
%2:2 = stream.timepoint.await %_constant__timepoint => %_constant_1, %_constant_0 : !stream.resource<constant>{%c192}, !stream.resource<constant>{%c192}
%3 = stream.resource.subview %2#0[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c24}
%4 = util.optimization_barrier %3 : !stream.resource<constant>
%5 = stream.resource.subview %2#1[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c12}
%6 = util.optimization_barrier %5 : !stream.resource<constant>
%7 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c4}
%8 = stream.cmd.execute with(%7 as %arg0: !stream.resource<transient>{%c4}) {
stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c4] : i32 -> !stream.resource<transient>{%c4}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<transient>{%c4}
%10 = util.optimization_barrier %9 : !stream.resource<transient>
%11 = stream.resource.size %4 : !stream.resource<constant>
%12 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8}
%13 = stream.cmd.execute with(%4 as %arg0: !stream.resource<constant>{%11}, %6 as %arg1: !stream.resource<constant>{%c12}, %10 as %arg2: !stream.resource<transient>{%c4}, %12 as %arg3: !stream.resource<external>{%c8}) {
stream.cmd.dispatch @_tensor_float_dispatch_0::@_tensor_float_dispatch_0_matmul_2x1x3[%c2, %c1] {
ro %arg0[%c0 for %11] : !stream.resource<constant>{%11},
ro %arg1[%c0 for %c12] : !stream.resource<constant>{%c12},
ro %arg2[%c0 for %c4] : !stream.resource<transient>{%c4},
wo %arg3[%c0 for %c8] : !stream.resource<external>{%c8}
}
} => !stream.timepoint
%14 = stream.timepoint.await %13 => %12 : !stream.resource<external>{%c8}
%15 = stream.tensor.export %14 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
%16 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c8}
%17 = stream.tensor.export %16 : tensor<2x1xf32> in !stream.resource<external>{%c8} -> tensor<2x1xf32>
check.expect_eq(%15, %17) : tensor<2x1xf32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#composite_of_192b = #util.composite<192xi8, [
dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>,
dense<0> : vector<56xi8>,
dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>,
dense<0> : vector<52xi8>,
dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>,
dense<0> : vector<40xi8>,
]>
#map = affine_map<(d0, d1) -> ()>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
util.global private mutable @_constant__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @_constant : !stream.resource<constant>
util.initializer {
%0 = stream.timepoint.immediate => !stream.timepoint
%c192 = arith.constant 192 : index
%c0 = arith.constant 0 : index
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_192b
%did_map, %result = stream.resource.try_map %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c192}
%1:2 = scf.if %did_map -> (!stream.resource<constant>, !stream.timepoint) {
scf.yield %result, %0 : !stream.resource<constant>, !stream.timepoint
} else {
%2 = stream.resource.map %buffer_cst[%c0] : !util.buffer -> !stream.resource<staging>{%c192}
%3 = stream.resource.alloc uninitialized : !stream.resource<constant>{%c192}
%4 = stream.cmd.execute with(%2 as %arg0: !stream.resource<staging>{%c192}, %3 as %arg1: !stream.resource<constant>{%c192}) {
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c192 : !stream.resource<staging>{%c192} -> !stream.resource<constant>{%c192}
} => !stream.timepoint
scf.yield %3, %4 : !stream.resource<constant>, !stream.timepoint
}
util.global.store %1#0, @_constant : !stream.resource<constant>
util.global.store %1#1, @_constant__timepoint : !stream.timepoint
util.initializer.return
}
func.func @tensor_float() attributes {iree.abi.stub} {
call @_tensor_float() : () -> ()
return
}
stream.executable private @_tensor_float_dispatch_0 {
stream.executable.export public @_tensor_float_dispatch_0_matmul_2x1x3 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_tensor_float_dispatch_0_matmul_2x1x3(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2x3xf32>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3x1xf32>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<f32>>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2x1xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x3xf32>> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1xf32>> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%7 = tensor.empty() : tensor<2x1xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1xf32>) -> tensor<2x1xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<2x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<2x1xf32>) -> tensor<2x1xf32>
%10 = linalg.generic {indexing_maps = [#map, #map1, #map1], i