Gemm dump after all IREE passes
@ThomasRaoux, September 17, 2021

What follows is a per-pass IR dump of a single GEMM test case moving through the IREE compiler; the "// -----// IR Dump After ... //----- //" separators are emitted by MLIR's IR-printing instrumentation (presumably via --mlir-print-ir-after-all). Note that the file is truncated: it ends partway through the OutlineLargeConstants dump.
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
%1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq_const(%2, dense<4.095960e+02> : tensor<2048x512xf32>) : tensor<2048x512xf32>
return
}
}
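
The program being compiled: a 2048x1024 matrix of 1.0 multiplied by a 1024x512 matrix of 0.4, checked against a constant filled with ~409.596. util.unfoldable_constant keeps the operands opaque so the dot cannot be constant-folded away before codegen. A minimal NumPy re-creation of the same math (my sketch, not IREE code):

import numpy as np

a = np.full((2048, 1024), 1.0, dtype=np.float32)  # util.unfoldable_constant dense<1.0>
b = np.full((1024, 512), 0.4, dtype=np.float32)   # util.unfoldable_constant dense<0.4>
c = a @ b                                         # mhlo.dot

# In exact arithmetic each element is 1024 * 1.0 * 0.4 = 409.6; the expected
# constant 4.095960e+02 differs in the last digits because f32(0.4) is inexact
# and the f32 accumulation order rounds, which is why the IR uses an
# approximate check (check.expect_almost_eq) rather than exact equality.
assert np.allclose(c, 409.596, rtol=1e-4)
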
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Inliner //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After CSE //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After CSE //----- //
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After SymbolDCE //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After ConvertShapeToStandard //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Inliner //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After LegalizeInputTypes //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After CSE //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After CSE //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = "mhlo.dot"(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%cst_2 = constant 0.000000e+00 : f32
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
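
ConvertMHLOToLinalgOnTensors is the first structural change: mhlo.dot becomes destination-passing-style linalg, where linalg.init_tensor creates the result shape, linalg.fill zeroes it, and linalg.matmul accumulates into its outs operand. Roughly, in Python (illustrative analogy only, not IREE code):

import numpy as np

a = np.ones((2048, 1024), np.float32)
b = np.full((1024, 512), 0.4, np.float32)

out = np.empty((2048, 512), np.float32)  # linalg.init_tensor [2048, 512]
out[:] = 0.0                             # linalg.fill(%cst_2, %2)
out += a @ b                             # linalg.matmul accumulating into outs(%3)

Splitting init, fill, and accumulate apart is what lets the later tiling passes fill and compute one output tile at a time.
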
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After IREEImportPublic //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After VerifyInputLegality //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After VerifyInputLegality //----- //
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After CSE //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After CSE //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After ExpandGlobalDynamicDims //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionDynamicDimsPass //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After InterchangeGenericOps //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After InterchangeGenericOps //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After FusionOfTensorOps //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After CSE //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After FusionOfTensorOps //----- //
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After CSE //----- //
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- //
func private @_large_aligned() {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_1 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_2 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_1) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_2 = constant 0.000000e+00 : f32
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%3 = linalg.fill(%cst_2, %2) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
%4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%4, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch.workgroups[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst_2 = constant 0.000000e+00 : f32
%c2048_3 = constant 2048 : index
%c512_4 = constant 512 : index
%3 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %4 to %c2048_3 step %5 {
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %6 to %c512_4 step %7 {
%8 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 2048)>(%arg3, %workgroup_size_1)
%9 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%10 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 512)>(%arg4, %workgroup_size_0)
%11 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%12 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 2048)>(%arg3, %workgroup_size_1)
%13 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 512)>(%arg4, %workgroup_size_0)
%14 = affine.min affine_map<(d0, d1) -> (-d0 + 2048, d1)>(%arg3, %workgroup_size_1)
%15 = affine.min affine_map<(d0, d1) -> (-d0 + 512, d1)>(%arg4, %workgroup_size_0)
%16 = tensor.extract_slice %3[%arg3, %arg4] [%14, %15] [1, 1] : tensor<2048x512xf32> to tensor<?x?xf32>
%17 = linalg.fill(%cst_2, %16) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%18 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%9, %11 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%17 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %18, %arg2, offsets = [%arg3, %arg4], sizes = [%12, %13], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
flow.return
}
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
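
DispatchLinalgOnTensors is where tiling happens: the matmul is wrapped in a flow.dispatch.workgroups region with workload [%c512, %c2048, %c1], and each workgroup walks the output in a grid-stride loop. affine_map<()[s0, s1] -> (s0 * s1)> computes the loop start (id * tile_size) and step (count * tile_size), and the affine.min maps clamp the final partial tile, e.g. (d0, d1) -> (d1, -d0 + 2048) is min(tile_size, 2048 - offset). A hypothetical Python model of the region above (names are mine; the workgroup sizes are still symbolic in the IR and get fixed later by the target backend):

import numpy as np

M, N, K = 2048, 512, 1024

def dispatch_workgroup(a, b, out, wg_id, wg_count, wg_size):
    for i in range(wg_id[1] * wg_size[1], M, wg_count[1] * wg_size[1]):      # scf.for %arg3
        for j in range(wg_id[0] * wg_size[0], N, wg_count[0] * wg_size[0]):  # scf.for %arg4
            h = min(wg_size[1], M - i)         # affine.min (d0, d1) -> (d1, -d0 + 2048)
            w = min(wg_size[0], N - j)         # affine.min (d0, d1) -> (d1, -d0 + 512)
            tile = np.zeros((h, w), a.dtype)   # tensor.extract_slice + linalg.fill
            tile += a[i:i+h, :] @ b[:, j:j+w]  # tiled linalg.matmul
            out[i:i+h, j:j+w] = tile           # flow.dispatch.tensor.store

# A single workgroup covering the whole grid reproduces the full matmul:
a = np.ones((M, K), np.float32)
b = np.full((K, N), 0.4, np.float32)
out = np.empty((M, N), np.float32)
dispatch_workgroup(a, b, out, wg_id=(0, 0), wg_count=(1, 1), wg_size=(64, 64))
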
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d1, -d0 + 2048)>
#map2 = affine_map<(d0, d1) -> (d1, -d0 + 512)>
#map3 = affine_map<(d0, d1) -> (-d0 + 2048, d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 512, d1)>
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch.workgroups[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst_2 = constant 0.000000e+00 : f32
%c2048_3 = constant 2048 : index
%c512_4 = constant 512 : index
%3 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%4 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%5 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %4 to %c2048_3 step %5 {
%6 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%7 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %6 to %c512_4 step %7 {
%8 = affine.min #map1(%arg3, %workgroup_size_1)
%9 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%10 = affine.min #map2(%arg4, %workgroup_size_0)
%11 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%12 = affine.min #map1(%arg3, %workgroup_size_1)
%13 = affine.min #map2(%arg4, %workgroup_size_0)
%14 = affine.min #map3(%arg3, %workgroup_size_1)
%15 = affine.min #map4(%arg4, %workgroup_size_0)
%16 = tensor.extract_slice %3[%arg3, %arg4] [%14, %15] [1, 1] : tensor<2048x512xf32> to tensor<?x?xf32>
%17 = linalg.fill(%cst_2, %16) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%18 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%9, %11 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%17 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %18, %arg2, offsets = [%arg3, %arg4], sizes = [%12, %13], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
flow.return
}
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch.workgroups[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst_2 = constant 0.000000e+00 : f32
%c2048_3 = constant 2048 : index
%c512_4 = constant 512 : index
%3 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %4 to %c2048_3 step %5 {
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %6 to %c512_4 step %7 {
%8 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 2048)>(%arg3, %workgroup_size_1)
%9 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%10 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 512)>(%arg4, %workgroup_size_0)
%11 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%12 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 2048)>(%arg3, %workgroup_size_1)
%13 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 512)>(%arg4, %workgroup_size_0)
%14 = affine.min affine_map<(d0, d1) -> (-d0 + 2048, d1)>(%arg3, %workgroup_size_1)
%15 = affine.min affine_map<(d0, d1) -> (-d0 + 512, d1)>(%arg4, %workgroup_size_0)
%16 = tensor.extract_slice %3[%arg3, %arg4] [%14, %15] [1, 1] : tensor<2048x512xf32> to tensor<?x?xf32>
%17 = linalg.fill(%cst_2, %16) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%18 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%9, %11 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%17 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %18, %arg2, offsets = [%arg3, %arg4], sizes = [%12, %13], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
flow.return
}
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch.workgroups[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%c512_2 = constant 512 : index
%c2048_3 = constant 2048 : index
%cst_4 = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %3 to %c2048_3 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %5 to %c512_2 step %6 {
%7 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 2048)>(%arg3, %workgroup_size_1)
%8 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%7, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%9 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 512)>(%arg4, %workgroup_size_0)
%10 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%11 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 2048)>(%arg3, %workgroup_size_1)
%12 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 512)>(%arg4, %workgroup_size_0)
%13 = affine.min affine_map<(d0, d1) -> (-d0 + 2048, d1)>(%arg3, %workgroup_size_1)
%14 = affine.min affine_map<(d0, d1) -> (-d0 + 512, d1)>(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.fill(%cst_4, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
flow.return
}
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After OutlineDispatchRegions //----- //
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d1, -d0 + 2048)>
#map2 = affine_map<(d0, d1) -> (d1, -d0 + 512)>
#map3 = affine_map<(d0, d1) -> (-d0 + 2048, d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 512, d1)>
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3, %workgroup_size_1)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4, %workgroup_size_0)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3, %workgroup_size_1)
%9 = affine.min #map2(%arg4, %workgroup_size_0)
%10 = affine.min #map3(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
}
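
OutlineDispatchRegions splits the dispatch region out of the function: the loop nest becomes a standalone flow.executable named @_large_aligned_dispatch_0, and the call site shrinks to a flow.dispatch carrying the workload [%c512, %c2048, %c1]. Continuing the Python analogy (invented names, reusing dispatch_workgroup from the earlier sketch):

import numpy as np

def _large_aligned_dispatch_0(a, b, out):
    # Stands in for the outlined executable; one workgroup spanning the
    # whole grid is enough for a functional model.
    dispatch_workgroup(a, b, out, wg_id=(0, 0), wg_count=(1, 1), wg_size=(64, 64))

def _large_aligned():
    a = np.ones((2048, 1024), np.float32)      # util.do_not_optimize(%cst)
    b = np.full((1024, 512), 0.4, np.float32)  # util.do_not_optimize(%cst_0)
    out = np.empty((2048, 512), np.float32)
    _large_aligned_dispatch_0(a, b, out)       # flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[...]
    return out
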
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After DeduplicateExecutables //----- //
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d1, -d0 + 2048)>
#map2 = affine_map<(d0, d1) -> (d1, -d0 + 512)>
#map3 = affine_map<(d0, d1) -> (-d0 + 2048, d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 512, d1)>
module {
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3, %workgroup_size_1)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4, %workgroup_size_0)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3, %workgroup_size_1)
%9 = affine.min #map2(%arg4, %workgroup_size_0)
%10 = affine.min #map3(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After CSE //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After CSE //----- //
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After HoistUnstreamableOps //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After HoistUnstreamableOps //----- //
func private @_large_aligned() {
%cst = constant dense<4.095960e+02> : tensor<2048x512xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%0 = util.do_not_optimize(%cst_1) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst) : tensor<2048x512xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After CSE //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After InsertConstantClones //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After CSE //----- //
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After FormStreams //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After InsertConstantClones //----- //
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After FormStreams //----- //
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant dense<1.000000e+00> : tensor<2048x1024xf32>
%cst_0 = constant dense<4.000000e-01> : tensor<1024x512xf32>
%cst_1 = constant dense<4.095960e+02> : tensor<2048x512xf32>
%0 = util.do_not_optimize(%cst) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%cst_0) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%c512, %c2048, %c1, %0, %1) : (index, index, index, tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: index, %arg1: index, %arg2: index, %arg3: tensor<2048x1024xf32>, %arg4: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%arg0, %arg1, %arg2](%arg3, %arg4) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %cst_1) : tensor<2048x512xf32>
return
}
// -----// IR Dump After OutlineLargeConstants //----- //
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d1, -d0 + 2048)>
#map2 = affine_map<(d0, d1) -> (d1, -d0 + 512)>
#map3 = affine_map<(d0, d1) -> (-d0 + 2048, d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 512, d1)>
module {
util.global private @_large_const_0 {noinline} = dense<1.000000e+00> : tensor<2048x1024xf32>
util.global private @_large_const_1 {noinline} = dense<4.000000e-01> : tensor<1024x512xf32>
util.global private @_large_const_2 {noinline} = dense<4.095960e+02> : tensor<2048x512xf32>
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3, %workgroup_size_1)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4, %workgroup_size_0)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3, %workgroup_size_1)
%9 = affine.min #map2(%arg4, %workgroup_size_0)
%10 = affine.min #map3(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%c1 = constant 1 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%_large_const_0 = util.global.load @_large_const_0 : tensor<2048x1024xf32>
%_large_const_1 = util.global.load @_large_const_1 : tensor<1024x512xf32>
%_large_const_2 = util.global.load @_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%_large_const_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%_large_const_1) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%c512, %c2048, %c1, %0, %1) : (index, index, index, tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: index, %arg1: index, %arg2: index, %arg3: tensor<2048x1024xf32>, %arg4: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%arg0, %arg1, %arg2](%arg3, %arg4) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %_large_const_2) : tensor<2048x512xf32>
return
}
}
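The outlined dispatch above distributes the 2048x512 output over a 2-D grid of workgroups: each workgroup starts at id*size and strides by count*size, and the affine.min ops clamp the last tile at the matrix edge. A minimal NumPy sketch of that loop nest, for illustration only (the 8x16 grid and 64x128 tile sizes are placeholders chosen here, not values from the dump):

import numpy as np

def dispatch_tile(A, B, C, wg_id, wg_count, wg_size):
    # wg_* are (x, y) pairs mirroring flow.dispatch.workgroup.{id,count,size}.
    M, N = 2048, 512
    for i in range(wg_id[1] * wg_size[1], M, wg_count[1] * wg_size[1]):
        for j in range(wg_id[0] * wg_size[0], N, wg_count[0] * wg_size[0]):
            h = min(wg_size[1], M - i)        # affine.min #map1 / #map3
            w = min(wg_size[0], N - j)        # affine.min #map2 / #map4
            acc = np.zeros((h, w), A.dtype)   # linalg.init_tensor + linalg.fill
            acc += A[i:i+h, :] @ B[:, j:j+w]  # linalg.matmul
            C[i:i+h, j:j+w] = acc             # flow.dispatch.tensor.store

A = np.ones((2048, 1024), np.float32)
B = np.full((1024, 512), 0.4, np.float32)
C = np.empty((2048, 512), np.float32)
for gy in range(16):                          # run every workgroup in the grid
    for gx in range(8):
        dispatch_tile(A, B, C, (gx, gy), (8, 16), (64, 128))
assert np.allclose(C, A @ B)
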
// -----// IR Dump After Canonicalizer //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After CSE //----- //
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
// -----// IR Dump After Canonicalizer //----- //
func private @_large_aligned() {
%_large_const_0 = util.global.load @_large_const_0 : tensor<2048x1024xf32>
%_large_const_1 = util.global.load @_large_const_1 : tensor<1024x512xf32>
%_large_const_2 = util.global.load @_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%_large_const_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%_large_const_1) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%c1 = constant 1 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %_large_const_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After CSE //----- //
func private @_large_aligned() {
%_large_const_0 = util.global.load @_large_const_0 : tensor<2048x1024xf32>
%_large_const_1 = util.global.load @_large_const_1 : tensor<1024x512xf32>
%_large_const_2 = util.global.load @_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%_large_const_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%_large_const_1) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%c1 = constant 1 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %_large_const_2) : tensor<2048x512xf32>
return
}
// -----// IR Dump After SymbolDCE //----- //
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d1, -d0 + 2048)>
#map2 = affine_map<(d0, d1) -> (d1, -d0 + 512)>
#map3 = affine_map<(d0, d1) -> (-d0 + 2048, d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 512, d1)>
module {
util.global private @_large_const_0 {noinline} = dense<1.000000e+00> : tensor<2048x1024xf32>
util.global private @_large_const_1 {noinline} = dense<4.000000e-01> : tensor<1024x512xf32>
util.global private @_large_const_2 {noinline} = dense<4.095960e+02> : tensor<2048x512xf32>
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3, %workgroup_size_1)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4, %workgroup_size_0)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3, %workgroup_size_1)
%9 = affine.min #map2(%arg4, %workgroup_size_0)
%10 = affine.min #map3(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%_large_const_0 = util.global.load @_large_const_0 : tensor<2048x1024xf32>
%_large_const_1 = util.global.load @_large_const_1 : tensor<1024x512xf32>
%_large_const_2 = util.global.load @_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%_large_const_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%_large_const_1) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%c1 = constant 1 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %_large_const_2) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module {
util.global private @_large_const_0 {noinline} = dense<1.000000e+00> : tensor<2048x1024xf32>
util.global private @_large_const_1 {noinline} = dense<4.000000e-01> : tensor<1024x512xf32>
util.global private @_large_const_2 {noinline} = dense<4.095960e+02> : tensor<2048x512xf32>
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map2(%arg4)[%workgroup_size_0]
%10 = affine.min #map3(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%_large_const_0 = util.global.load @_large_const_0 : tensor<2048x1024xf32>
%_large_const_1 = util.global.load @_large_const_1 : tensor<1024x512xf32>
%_large_const_2 = util.global.load @_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%_large_const_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%_large_const_1) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c1 = constant 1 : index
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %_large_const_2) : tensor<2048x512xf32>
return
}
}
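Note what the canonicalizer changed above: the tile-bound maps went from two-dimension form, e.g. affine_map<(d0, d1) -> (d1, -d0 + 2048)>, to dimension-plus-symbol form (d0)[s0], since the workgroup size is loop-invariant. The clamp is identical either way; a tiny check (Python, with 64 as an arbitrary example tile size):

# min(tile, edge) with the workgroup size as a dimension (before) or as a
# symbol (after) is the same function; canonicalization only reclassifies it.
before = lambda d0, d1: min(d1, -d0 + 2048)   # #map1 as (d0, d1)
after  = lambda d0, s0: min(s0, -d0 + 2048)   # #map1 as (d0)[s0]
assert all(before(i, 64) == after(i, 64) for i in range(0, 2048, 64))
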
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
util.global private @_large_const_0 {noinline} = dense<1.000000e+00> : tensor<2048x1024xf32>
util.global private @_large_const_1 {noinline} = dense<4.000000e-01> : tensor<1024x512xf32>
util.global private @_large_const_2 {noinline} = dense<4.095960e+02> : tensor<2048x512xf32>
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map2(%arg4)[%workgroup_size_0]
%10 = affine.min #map3(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%_large_const_0 = util.global.load @_large_const_0 : tensor<2048x1024xf32>
%_large_const_1 = util.global.load @_large_const_1 : tensor<1024x512xf32>
%_large_const_2 = util.global.load @_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%_large_const_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%_large_const_1) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c1 = constant 1 : index
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %_large_const_2) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
util.global private @_large_const_0 {noinline} = dense<1.000000e+00> : tensor<2048x1024xf32>
util.global private @_large_const_1 {noinline} = dense<4.000000e-01> : tensor<1024x512xf32>
util.global private @_large_const_2 {noinline} = dense<4.095960e+02> : tensor<2048x512xf32>
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map2(%arg4)[%workgroup_size_0]
%10 = affine.min #map3(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%_large_const_0 = util.global.load @_large_const_0 : tensor<2048x1024xf32>
%_large_const_1 = util.global.load @_large_const_1 : tensor<1024x512xf32>
%_large_const_2 = util.global.load @_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%_large_const_0) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%_large_const_1) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c1 = constant 1 : index
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %_large_const_2) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::IdentifyConstantPoolsPass //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
hal.constant_pool private @_const_pool attributes {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 16, max_buffer_range = 1073741824, min_buffer_range_alignment = 16>} {
hal.constant_pool.value nested @_large_const_0 = dense<1.000000e+00> : tensor<2048x1024xf32>
hal.constant_pool.value nested @_large_const_1 = dense<4.000000e-01> : tensor<1024x512xf32>
hal.constant_pool.value nested @_large_const_2 = dense<4.095960e+02> : tensor<2048x512xf32>
}
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map2(%arg4)[%workgroup_size_0]
%10 = affine.min #map3(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%const = hal.constant_pool.load @_const_pool::@_large_const_0 : tensor<2048x1024xf32>
%const_0 = hal.constant_pool.load @_const_pool::@_large_const_1 : tensor<1024x512xf32>
%const_1 = hal.constant_pool.load @_const_pool::@_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%const) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%const_0) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c1 = constant 1 : index
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %const_1) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PackConstantPoolStoragePass //----- //
hal.constant_pool private @_const_pool attributes {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 16, max_buffer_range = 1073741824, min_buffer_range_alignment = 16>} {
hal.constant_pool.splat nested @_large_const_0 = dense<1.000000e+00> : tensor<2048x1024xf32>
hal.constant_pool.splat nested @_large_const_1 = dense<4.000000e-01> : tensor<1024x512xf32>
hal.constant_pool.splat nested @_large_const_2 = dense<4.095960e+02> : tensor<2048x512xf32>
}
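All three constants are splats, so PackConstantPoolStorage keeps only a per-value fill pattern instead of embedding ~14 MB of data; the next pass then lays the three logical buffers out back-to-back in one allocation. The byte ranges that appear in the following dump fall straight out of the shapes (f32 = 4 bytes); a quick arithmetic check in Python:

sizes = {
    "_large_const_0": 2048 * 1024 * 4,  # 8388608  -> offset 0
    "_large_const_1": 1024 * 512 * 4,   # 2097152  -> offset 8388608
    "_large_const_2": 2048 * 512 * 4,   # 4194304  -> offset 10485760
}
assert sum(sizes.values()) == 14680064           # the allocation size below
assert all(v % 16 == 0 for v in sizes.values())  # min_buffer_offset_alignment = 16
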
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeConstantPoolBuffersPass //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
hal.constant_pool private @_const_pool attributes {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 16, max_buffer_range = 1073741824, min_buffer_range_alignment = 16>} {
hal.constant_pool.splat nested @_large_const_0 = dense<1.000000e+00> : tensor<2048x1024xf32> -> @_const_pool_splats[#util.byte_range<0, 8388608>]
hal.constant_pool.splat nested @_large_const_1 = dense<4.000000e-01> : tensor<1024x512xf32> -> @_const_pool_splats[#util.byte_range<8388608, 2097152>]
hal.constant_pool.splat nested @_large_const_2 = dense<4.095960e+02> : tensor<2048x512xf32> -> @_const_pool_splats[#util.byte_range<10485760, 4194304>]
}
util.global private @_const_pool_splats : !hal.buffer
util.initializer {
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%c14680064 = constant 14680064 : index
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Constant|Transfer|Mapping|Dispatch") : !hal.buffer{%c14680064}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%c0 = constant 0 : index
%c8388608 = constant 8388608 : index
%c1065353216_i32 = constant 1065353216 : i32
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c8388608] pattern(%c1065353216_i32 : i32)
%c8388608_0 = constant 8388608 : index
%c2097152 = constant 2097152 : index
%c1053609165_i32 = constant 1053609165 : i32
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c8388608_0, %c2097152] pattern(%c1053609165_i32 : i32)
%c10485760 = constant 10485760 : index
%c4194304 = constant 4194304 : index
%c1137495114_i32 = constant 1137495114 : i32
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c10485760, %c4194304] pattern(%c1137495114_i32 : i32)
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
util.global.store %buffer, @_const_pool_splats : !hal.buffer
util.initializer.return
}
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map2(%arg4)[%workgroup_size_0]
%10 = affine.min #map3(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%const = hal.constant_pool.load @_const_pool::@_large_const_0 : tensor<2048x1024xf32>
%const_0 = hal.constant_pool.load @_const_pool::@_large_const_1 : tensor<1024x512xf32>
%const_1 = hal.constant_pool.load @_const_pool::@_large_const_2 : tensor<2048x512xf32>
%0 = util.do_not_optimize(%const) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%const_0) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c1 = constant 1 : index
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %const_1) : tensor<2048x512xf32>
return
}
}
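The three i32 patterns passed to hal.command_buffer.fill_buffer in the initializer above are just the IEEE-754 bit patterns of the splat values: f32 is four bytes, so filling with a 4-byte pattern reproduces the splat. Decoding them (Python's struct assumed; not part of the dump):

import struct
for pattern in (1065353216, 1053609165, 1137495114):
    value, = struct.unpack('<f', struct.pack('<I', pattern))
    print(f"0x{pattern:08x} -> {value}")
# 0x3f800000 -> 1.0       (the dense<1.000000e+00> splat)
# 0x3ecccccd -> ~0.4      (float32(0.4))
# 0x43cccc4a -> ~409.596  (float32(4.095960e+02), the expected result)
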
// -----// IR Dump After Canonicalizer //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
hal.constant_pool private @_const_pool attributes {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 16, max_buffer_range = 1073741824, min_buffer_range_alignment = 16>} {
hal.constant_pool.splat nested @_large_const_0 = dense<1.000000e+00> : tensor<2048x1024xf32> -> @_const_pool_splats[#util.byte_range<0, 8388608>]
hal.constant_pool.splat nested @_large_const_1 = dense<4.000000e-01> : tensor<1024x512xf32> -> @_const_pool_splats[#util.byte_range<8388608, 2097152>]
hal.constant_pool.splat nested @_large_const_2 = dense<4.095960e+02> : tensor<2048x512xf32> -> @_const_pool_splats[#util.byte_range<10485760, 4194304>]
}
util.global private @_const_pool_splats : !hal.buffer
util.initializer {
%c1137495114_i32 = constant 1137495114 : i32
%c4194304 = constant 4194304 : index
%c10485760 = constant 10485760 : index
%c1053609165_i32 = constant 1053609165 : i32
%c2097152 = constant 2097152 : index
%c8388608 = constant 8388608 : index
%c1065353216_i32 = constant 1065353216 : i32
%c0 = constant 0 : index
%c14680064 = constant 14680064 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Constant|Transfer|Mapping|Dispatch") : !hal.buffer{%c14680064}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c8388608] pattern(%c1065353216_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c8388608, %c2097152] pattern(%c1053609165_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c10485760, %c4194304] pattern(%c1137495114_i32 : i32)
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
util.global.store %buffer, @_const_pool_splats : !hal.buffer
util.initializer.return
}
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map2(%arg4)[%workgroup_size_0]
%10 = affine.min #map3(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%const_span = hal.constant.subspan @_const_pool_splats[#util.byte_range<0, 8388608>] : tensor<2048x1024xf32>
%const_span_0 = hal.constant.subspan @_const_pool_splats[#util.byte_range<8388608, 2097152>] : tensor<1024x512xf32>
%const_span_1 = hal.constant.subspan @_const_pool_splats[#util.byte_range<10485760, 4194304>] : tensor<2048x512xf32>
%0 = util.do_not_optimize(%const_span) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%const_span_0) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%c1 = constant 1 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %const_span_1) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After SymbolDCE //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
util.global private @_const_pool_splats : !hal.buffer
util.initializer {
%c1137495114_i32 = constant 1137495114 : i32
%c4194304 = constant 4194304 : index
%c10485760 = constant 10485760 : index
%c1053609165_i32 = constant 1053609165 : i32
%c2097152 = constant 2097152 : index
%c8388608 = constant 8388608 : index
%c1065353216_i32 = constant 1065353216 : i32
%c0 = constant 0 : index
%c14680064 = constant 14680064 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Constant|Transfer|Mapping|Dispatch") : !hal.buffer{%c14680064}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c8388608] pattern(%c1065353216_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c8388608, %c2097152] pattern(%c1053609165_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c10485760, %c4194304] pattern(%c1137495114_i32 : i32)
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
util.global.store %buffer, @_const_pool_splats : !hal.buffer
util.initializer.return
}
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
flow.executable private @_large_aligned_dispatch_0 {
flow.dispatch.entry public @_large_aligned_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @_large_aligned_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:2048x1024xf32>, %arg1: !flow.dispatch.tensor<readonly:1024x512xf32>, %arg2: !flow.dispatch.tensor<writeonly:2048x512xf32>) {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c2048 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c512 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%6 = affine.min #map2(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [1024, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map2(%arg4)[%workgroup_size_0]
%10 = affine.min #map3(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = linalg.fill(%cst, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
}
}
func private @_large_aligned() {
%const_span = hal.constant.subspan @_const_pool_splats[#util.byte_range<0, 8388608>] : tensor<2048x1024xf32>
%const_span_0 = hal.constant.subspan @_const_pool_splats[#util.byte_range<8388608, 2097152>] : tensor<1024x512xf32>
%const_span_1 = hal.constant.subspan @_const_pool_splats[#util.byte_range<10485760, 4194304>] : tensor<2048x512xf32>
%0 = util.do_not_optimize(%const_span) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%const_span_0) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%c1 = constant 1 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %const_span_1) : tensor<2048x512xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
util.global private @_const_pool_splats : !hal.buffer
util.initializer {
%c1137495114_i32 = constant 1137495114 : i32
%c4194304 = constant 4194304 : index
%c10485760 = constant 10485760 : index
%c1053609165_i32 = constant 1053609165 : i32
%c2097152 = constant 2097152 : index
%c8388608 = constant 8388608 : index
%c1065353216_i32 = constant 1065353216 : i32
%c0 = constant 0 : index
%c14680064 = constant 14680064 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Constant|Transfer|Mapping|Dispatch") : !hal.buffer{%c14680064}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c8388608] pattern(%c1065353216_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c8388608, %c2097152] pattern(%c1053609165_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c10485760, %c4194304] pattern(%c1137495114_i32 : i32)
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
util.global.store %buffer, @_const_pool_splats : !hal.buffer
util.initializer.return
}
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
hal.executable private @_large_aligned_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @cuda_nvptx_fb, target = #executable_target_cuda_nvptx_fb {
hal.executable.entry_point public @_large_aligned_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @_large_aligned_dispatch_0() {
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = affine.min #map1(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%9 = affine.min #map2(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%11 = affine.min #map1(%arg0)[%workgroup_size_y]
%12 = affine.min #map2(%arg1)[%workgroup_size_x]
%13 = affine.min #map3(%arg0)[%workgroup_size_y]
%14 = affine.min #map4(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func private @_large_aligned() {
%const_span = hal.constant.subspan @_const_pool_splats[#util.byte_range<0, 8388608>] : tensor<2048x1024xf32>
%const_span_0 = hal.constant.subspan @_const_pool_splats[#util.byte_range<8388608, 2097152>] : tensor<1024x512xf32>
%const_span_1 = hal.constant.subspan @_const_pool_splats[#util.byte_range<10485760, 4194304>] : tensor<2048x512xf32>
%0 = util.do_not_optimize(%const_span) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%const_span_0) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%c1 = constant 1 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %const_span_1) : tensor<2048x512xf32>
return
}
}
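MaterializeInterfaces replaces the dispatch function's tensor arguments with set-0 storage-buffer bindings and stamps the dispatch site with a hal.bindings attribute recording which operand or result feeds each one. Restated as plain data (illustrative only, not an IREE API):

# set 0 binding name      access           bound value
io_bindings = [
    ("s0b0_ro_external",  "Read",          "operand 0: tensor<2048x1024xf32>"),
    ("s0b1_ro_external",  "Read",          "operand 1: tensor<1024x512xf32>"),
    ("s0b2_xw_external",  "Write|Discard", "result 0:  tensor<2048x512xf32>"),
]
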
// -----// IR Dump After Canonicalizer //----- //
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb">]}>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 2048)>
#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 512)>
#map3 = affine_map<(d0)[s0] -> (-d0 + 2048, s0)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 512, s0)>
module attributes {hal.device.targets = [#device_target_cuda]} {
util.global private @_const_pool_splats : !hal.buffer
util.initializer {
%c14680064 = constant 14680064 : index
%c0 = constant 0 : index
%c1065353216_i32 = constant 1065353216 : i32
%c8388608 = constant 8388608 : index
%c2097152 = constant 2097152 : index
%c1053609165_i32 = constant 1053609165 : i32
%c10485760 = constant 10485760 : index
%c4194304 = constant 4194304 : index
%c1137495114_i32 = constant 1137495114 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Constant|Transfer|Mapping|Dispatch") : !hal.buffer{%c14680064}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Transfer) : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c0, %c8388608] pattern(%c1065353216_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c8388608, %c2097152] pattern(%c1053609165_i32 : i32)
hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer> target(%buffer : !hal.buffer)[%c10485760, %c4194304] pattern(%c1137495114_i32 : i32)
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
util.global.store %buffer, @_const_pool_splats : !hal.buffer
util.initializer.return
}
func @large_aligned() attributes {iree.abi.stub} {
call @_large_aligned() : () -> ()
return
}
hal.executable private @_large_aligned_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @cuda_nvptx_fb, target = #executable_target_cuda_nvptx_fb {
hal.executable.entry_point public @_large_aligned_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @_large_aligned_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = affine.min #map1(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%9 = affine.min #map2(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%11 = affine.min #map1(%arg0)[%workgroup_size_y]
%12 = affine.min #map2(%arg1)[%workgroup_size_x]
%13 = affine.min #map3(%arg0)[%workgroup_size_y]
%14 = affine.min #map4(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func private @_large_aligned() {
%const_span = hal.constant.subspan @_const_pool_splats[#util.byte_range<0, 8388608>] : tensor<2048x1024xf32>
%const_span_0 = hal.constant.subspan @_const_pool_splats[#util.byte_range<8388608, 2097152>] : tensor<1024x512xf32>
%const_span_1 = hal.constant.subspan @_const_pool_splats[#util.byte_range<10485760, 4194304>] : tensor<2048x512xf32>
%0 = util.do_not_optimize(%const_span) : tensor<2048x1024xf32>
%1 = util.do_not_optimize(%const_span_0) : tensor<1024x512xf32>
%2 = flow.ex.stream.fragment(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
(%arg0: tensor<2048x1024xf32>, %arg1: tensor<1024x512xf32>) -> tensor<2048x512xf32> {
%c1 = constant 1 : index
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%3 = flow.dispatch @_large_aligned_dispatch_0::@_large_aligned_dispatch_0[%c512, %c2048, %c1](%arg0, %arg1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
flow.return %3 : tensor<2048x512xf32>
}
check.expect_almost_eq(%2, %const_span_1) : tensor<2048x512xf32>
return
}
}
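// The whole matmul is a single dispatch at this point: the stream fragment
// launches @_large_aligned_dispatch_0 over a (512, 2048, 1) workload
// (N, M, 1) with the LHS and RHS bound read-only and the result write-only.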
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- //
hal.executable.variant public @cuda_nvptx_fb, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
hal.executable.entry_point public @_large_aligned_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @_large_aligned_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2048)>(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<?x1024xf32>
%9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 512)>(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x?xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2048)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 512)>(%arg1)[%workgroup_size_x]
%13 = affine.min affine_map<(d0)[s0] -> (-d0 + 2048, s0)>(%arg0)[%workgroup_size_y]
%14 = affine.min affine_map<(d0)[s0] -> (-d0 + 512, s0)>(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
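// PropagateConstantWorkgroupInfo has nothing to propagate yet (the entry
// point carries no workgroup_size attribute), so the body is unchanged; the
// #map0..#map4 aliases simply print inline now that the variant is dumped on
// its own.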
// -----// IR Dump After SetNumWorkgroups //----- //
hal.executable.variant public @cuda_nvptx_fb, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
hal.executable.entry_point public @_large_aligned_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = 4 : i32, workloadPerWorkgroup = [128, 64]}, workgroup_size = [32 : index, 4 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @_large_aligned_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%c128 = constant 128 : index
%c64 = constant 64 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c64]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c64]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c128]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c128]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<64x1024xf32>
%8 = tensor.cast %7 : tensor<64x1024xf32> to tensor<?x1024xf32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x128xf32>
%10 = tensor.cast %9 : tensor<1024x128xf32> to tensor<1024x?xf32>
%11 = linalg.init_tensor [%c64, %c128] : tensor<?x?xf32>
%12 = linalg.fill(%cst, %11) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%13 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%8, %10 : tensor<?x1024xf32>, tensor<1024x?xf32>) outs(%12 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%c64, %c128], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
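// SetNumWorkgroups picks the LLVMGPU matmul configuration: a 64x128 workgroup
// tile with K tiled by 8 (tileSizes [[64, 128, 8], [], [16, 4]]), a 32x4x1
// workgroup, and an entry-point region computing the grid as
// (512 ceildiv 128, 2048 ceildiv 64, 1) = (4, 32, 1). passPipeline = 4
// presumably selects the LLVMGPU tile-and-distribute matmul pipeline.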
// -----// IR Dump After Canonicalizer //----- //
hal.executable.variant public @cuda_nvptx_fb, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
hal.executable.entry_point public @_large_aligned_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = 4 : i32, workloadPerWorkgroup = [128, 64]}, workgroup_size = [32 : index, 4 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @_large_aligned_dispatch_0() {
%c64 = constant 64 : index
%c128 = constant 128 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<64x1024xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x128xf32>
%9 = linalg.init_tensor [64, 128] : tensor<64x128xf32>
%10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, tensor<64x128xf32> -> tensor<64x128xf32>
%11 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%7, %8 : tensor<64x1024xf32>, tensor<1024x128xf32>) outs(%10 : tensor<64x128xf32>) -> tensor<64x128xf32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [%c64, %c128], strides = [1, 1] : tensor<64x128xf32> -> !flow.dispatch.tensor<writeonly:2048x512xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
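// Canonicalization folded the tensor.cast ops away and made the tile shapes
// fully static: the loads are now tensor<64x1024xf32> and tensor<1024x128xf32>,
// and init_tensor/fill/matmul operate on tensor<64x128xf32>.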
// -----// IR Dump After LinalgBufferize //----- //
func @_large_aligned_dispatch_0() {
%c64 = constant 64 : index
%c128 = constant 128 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c2048 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c512 step %9 {
%10 = memref.subview %0[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%11 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0], sizes = [64, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:2048x1024xf32> -> tensor<64x1024xf32>
%12 = memref.subview %2[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%13 = flow.dispatch.tensor.load %3, offsets = [0, %arg1], sizes = [1024, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x128xf32>
%14 = linalg.init_tensor [64, 128] : tensor<64x128xf32>
%c64_0 = constant 64 : index
%c128_1 = constant 128 : index
%15 = memref.subview %4[%arg0, %arg1] [%c64_0, %c128_1] [1, 1] : memref<2048x512xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %15) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%16 = linalg.fill(%cst, %14) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, tensor<64x128xf32> -> tensor<64x128xf32>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%10, %12 : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%15 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%11, %13 : tensor<64x1024xf32>, tensor<1024x128xf32>) outs(%16 : tensor<64x128xf32>) -> tensor<64x128xf32>
}
}
return
}
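// Bufferization leaves tensor and memref forms side by side: every
// flow.dispatch.tensor subspan has gained a memref twin, and fill/matmul are
// duplicated onto subviews of the output buffer. The now-dead tensor ops are
// swept by the following passes.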
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @_large_aligned_dispatch_0() {
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%c64 = constant 64 : index
%c128 = constant 128 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c2048 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c512 step %9 {
%10 = memref.subview %0[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%11 = memref.subview %2[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%12 = memref.subview %4[%arg0, %arg1] [%c64, %c128] [1, 1] : memref<2048x512xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %12) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%10, %11 : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%12 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
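// The tensor-side fill/matmul copies are gone; only the unused
// !flow.dispatch.tensor subspans (%1, %3, %5) linger as dead code.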
// -----// IR Dump After Canonicalizer //----- //
func @_large_aligned_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c2048 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c512 step %9 {
%10 = memref.subview %0[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%11 = memref.subview %2[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%12 = memref.subview %4[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %12) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%10, %11 : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%12 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
return
}
// -----// IR Dump After CSE //----- //
func @_large_aligned_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:2048x1024xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c2048 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c512 step %9 {
%10 = memref.subview %0[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%11 = memref.subview %2[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%12 = memref.subview %4[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %12) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%10, %11 : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%12 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
return
}
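// Canonicalization gave the output subview a static memref<64x128xf32> type
// (the %c64/%c128 size operands folded into it); CSE changed nothing further.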
// -----// IR Dump After CleanupBufferAllocView //----- //
func @_large_aligned_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = memref.subview %0[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%8 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%9 = memref.subview %2[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%7, %8 : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%9 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
return
}
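// CleanupBufferAllocView dropped the dead !flow.dispatch.tensor subspans,
// leaving exactly one memref binding per interface buffer.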
// -----// IR Dump After Canonicalizer //----- //
module {
func @_large_aligned_dispatch_0() {
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = memref.subview %0[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%8 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%9 = memref.subview %2[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%7, %8 : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%9 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
module {
func @_large_aligned_dispatch_0() {
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c2048 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c512 step %6 {
%7 = memref.subview %0[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%8 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%9 = memref.subview %2[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%7, %8 : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>) outs(%9 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
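// The post-bufferization Canonicalizer and CSE reruns are no-ops here apart
// from constant reordering.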
// -----// IR Dump After LLVMGPUTileAndDistribute //----- //
func @_large_aligned_dispatch_0() {
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1024 = constant 1024 : index
%c128 = constant 128 : index
%c64 = constant 64 : index
%0 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%1 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%2 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %5 to %c2048 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %7 to %c512 step %8 {
%9 = memref.subview %2[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%10 = memref.subview %3[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%11 = memref.subview %4[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%13]
scf.for %arg2 = %14 to %c64 step %c64 {
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
scf.for %arg3 = %15 to %c128 step %c128 {
%16 = memref.subview %11[%arg2, %arg3] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %c1024 step %c8 {
%15 = memref.subview %9[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%16 = memref.subview %10[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%17 = memref.subview %1[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%18 = memref.subview %0[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
linalg.copy(%15, %17) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
linalg.copy(%16, %18) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%19 = "gpu.thread_id"() {dimension = "x"} : () -> index
%20 = "gpu.thread_id"() {dimension = "y"} : () -> index
%21 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%20]
scf.for %arg3 = %21 to %c64 step %c64 {
%22 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
scf.for %arg4 = %22 to %c128 step %c128 {
%23 = memref.subview %17[%arg3, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%24 = memref.subview %18[0, %arg4] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
%25 = memref.subview %11[%arg3, %arg4] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%23, %24 : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>) outs(%25 : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
}
}
}
return
}
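// LLVMGPUTileAndDistribute introduces the two shared-memory staging buffers
// in address space 3 (64x8 for the LHS K-slice, 8x128 for the RHS K-slice),
// tiles K by 8, distributes the 64x128 workgroup tile across the 32x4 threads
// as one 16x4 tile per thread, and brackets the staging copies with
// gpu.barrier. At this stage the per-thread matmul still accumulates directly
// into the global output subview.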
// -----// IR Dump After LLVMGPUDistributeSharedMemoryCopy //----- //
func @_large_aligned_dispatch_0() {
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1024 = constant 1024 : index
%c128 = constant 128 : index
%c64 = constant 64 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %8 to %c2048 step %9 {
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %10 to %c512 step %11 {
%12 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%14 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%15 = "gpu.thread_id"() {dimension = "x"} : () -> index
%16 = "gpu.thread_id"() {dimension = "y"} : () -> index
%17 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%16]
scf.for %arg2 = %17 to %c64 step %c64 {
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%15]
scf.for %arg3 = %18 to %c128 step %c128 {
%19 = memref.subview %14[%arg2, %arg3] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %19) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %c1024 step %c8 {
%18 = memref.subview %12[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%19 = memref.subview %13[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%20 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%21 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%23 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%24 = vector.transfer_read %18[%22, %23], %cst {in_bounds = [true, true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<1x4xf32>
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
vector.transfer_write %24, %20[%25, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%28 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%29 = vector.transfer_read %19[%27, %28], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%30 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%31 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%32 = vector.transfer_read %19[%30, %31], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%33 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%34 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
vector.transfer_write %29, %21[%33, %34] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%35 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%36 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
vector.transfer_write %32, %21[%35, %36] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%37 = "gpu.thread_id"() {dimension = "x"} : () -> index
%38 = "gpu.thread_id"() {dimension = "y"} : () -> index
%39 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%38]
scf.for %arg3 = %39 to %c64 step %c64 {
%40 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%37]
scf.for %arg4 = %40 to %c128 step %c128 {
%41 = memref.subview %20[%arg3, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%42 = memref.subview %21[0, %arg4] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
%43 = memref.subview %14[%arg3, %arg4] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%41, %42 : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>) outs(%43 : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
}
}
}
return
}
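// LLVMGPUDistributeSharedMemoryCopy turned the two linalg.copy ops into
// per-thread vector<1x4xf32> transfer_read/transfer_write pairs: each of the
// 128 threads moves one 4-element row fragment of the 64x8 LHS tile and two
// fragments (rows offset by 4) of the 8x128 RHS tile.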
// -----// IR Dump After Canonicalizer //----- //
module {
memref.global "private" @__shared_memory___0 : memref<8x128xf32, 3>
memref.global "private" @__shared_memory__ : memref<64x8xf32, 3>
func @_large_aligned_dispatch_0() {
%c64 = constant 64 : index
%c128 = constant 128 : index
%c1024 = constant 1024 : index
%c8 = constant 8 : index
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %8 to %c2048 step %9 {
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %10 to %c512 step %11 {
%12 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%14 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%15 = "gpu.thread_id"() {dimension = "x"} : () -> index
%16 = "gpu.thread_id"() {dimension = "y"} : () -> index
%17 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%16]
scf.for %arg2 = %17 to %c64 step %c64 {
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%15]
scf.for %arg3 = %18 to %c128 step %c128 {
%19 = memref.subview %14[%arg2, %arg3] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %19) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %c1024 step %c8 {
%18 = memref.subview %12[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%19 = memref.subview %13[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%20 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%21 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%23 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%24 = vector.transfer_read %18[%22, %23], %cst {in_bounds = [true, true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<1x4xf32>
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
vector.transfer_write %24, %20[%25, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%28 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%29 = vector.transfer_read %19[%27, %28], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%30 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%31 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%32 = vector.transfer_read %19[%30, %31], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%33 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%34 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
vector.transfer_write %29, %21[%33, %34] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%35 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%36 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
vector.transfer_write %32, %21[%35, %36] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%37 = "gpu.thread_id"() {dimension = "x"} : () -> index
%38 = "gpu.thread_id"() {dimension = "y"} : () -> index
%39 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%38]
scf.for %arg3 = %39 to %c64 step %c64 {
%40 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%37]
scf.for %arg4 = %40 to %c128 step %c128 {
%41 = memref.subview %20[%arg3, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%42 = memref.subview %21[0, %arg4] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
%43 = memref.subview %14[%arg3, %arg4] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%41, %42 : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>) outs(%43 : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
module {
memref.global "private" @__shared_memory___0 : memref<8x128xf32, 3>
memref.global "private" @__shared_memory__ : memref<64x8xf32, 3>
func @_large_aligned_dispatch_0() {
%c64 = constant 64 : index
%c128 = constant 128 : index
%c1024 = constant 1024 : index
%c8 = constant 8 : index
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %8 to %c2048 step %9 {
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %10 to %c512 step %11 {
%12 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%14 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%15 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
scf.for %arg2 = %15 to %c64 step %c64 {
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
scf.for %arg3 = %16 to %c128 step %c128 {
%17 = memref.subview %14[%arg2, %arg3] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %17) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %c1024 step %c8 {
%16 = memref.subview %12[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%17 = memref.subview %13[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%18 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%19 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%20 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%21 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%22 = vector.transfer_read %16[%20, %21], %cst {in_bounds = [true, true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %22, %18[%20, %21] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%24 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%25 = vector.transfer_read %17[%23, %24], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%26 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%27 = vector.transfer_read %17[%26, %24], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %25, %19[%23, %24] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %27, %19[%26, %24] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
scf.for %arg3 = %15 to %c64 step %c64 {
%28 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
scf.for %arg4 = %28 to %c128 step %c128 {
%29 = memref.subview %18[%arg3, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%30 = memref.subview %19[0, %arg4] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
%31 = memref.subview %14[%arg3, %arg4] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%29, %30 : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>) outs(%31 : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
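// Canonicalization and CSE deduplicated the repeated gpu.thread_id and
// affine.apply results left behind by the distributed copies.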
// -----// IR Dump After LLVMGPURemoveSingleIterationLoop //----- //
func @_large_aligned_dispatch_0() {
%c1024 = constant 1024 : index
%c8 = constant 8 : index
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %8 to %c2048 step %9 {
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %10 to %c512 step %11 {
%12 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%14 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%15 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%17 = memref.subview %14[%15, %16] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.fill(%cst, %17) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} : f32, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
scf.for %arg2 = %c0 to %c1024 step %c8 {
%18 = memref.subview %12[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%19 = memref.subview %13[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%20 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%21 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
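        // Cooperative copy of the current K-slices into shared memory: thread
        // ids are linearized so each thread stores one vector<1x4xf32> of the
        // 64x8 A slice (row = y*16 + z*64 + x floordiv 2) and two of the
        // 8x128 B slice (rows offset by 4), filling @__shared_memory__ and
        // @__shared_memory___0 between the two barriers.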
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%23 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%24 = vector.transfer_read %18[%22, %23], %cst {in_bounds = [true, true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %24, %20[%22, %23] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%27 = vector.transfer_read %19[%25, %26], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%28 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%29 = vector.transfer_read %19[%28, %26], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %27, %21[%25, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %29, %21[%28, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
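        // With the shared tiles now visible to all threads, each thread
        // multiplies its 16x8 slice of A by an 8x4 slice of B, accumulating
        // into its 16x4 output sub-tile.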
%30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%31 = memref.subview %20[%15, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%32 = memref.subview %21[0, %30] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
%33 = memref.subview %14[%15, %30] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[64, 128, 8], [], [16, 4]]}} ins(%31, %32 : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>) outs(%33 : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
}
}
}
return
}
// -----// IR Dump After LLVMGPUVectorization //----- //
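// Note: vectorization rewrote the linalg.fill and linalg.matmul into explicit
// vector ops. The fill becomes 16 zero-valued vector<1x4xf32> writes, and the
// (16x8) * (8x4) matmul is fully unrolled into transfer_reads plus chains of
// vector.fma; schematically, for r in 0..15 and k in 0..7:
//   C[r][0:4] += splat(A[r][k]) * B[k][0:4]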
func @_large_aligned_dispatch_0() {
%c1024 = constant 1024 : index
%c8 = constant 8 : index
%cst = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c0 = constant 0 : index
%cst_0 = constant dense<0.000000e+00> : vector<16x4xf32>
%c1 = constant 1 : index
%c2 = constant 2 : index
%c3 = constant 3 : index
%c4 = constant 4 : index
%c5 = constant 5 : index
%c6 = constant 6 : index
%c7 = constant 7 : index
%c9 = constant 9 : index
%c10 = constant 10 : index
%c11 = constant 11 : index
%c12 = constant 12 : index
%c13 = constant 13 : index
%c14 = constant 14 : index
%c15 = constant 15 : index
%cst_1 = constant dense<0.000000e+00> : vector<1x4xf32>
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %8 to %c2048 step %9 {
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %10 to %c512 step %11 {
%12 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%14 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%15 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%17 = memref.subview %14[%15, %16] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%18 = vector.extract_strided_slice %cst_0 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %18, %17[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%19 = vector.extract_strided_slice %cst_0 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %19, %17[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%20 = vector.extract_strided_slice %cst_0 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %20, %17[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%21 = vector.extract_strided_slice %cst_0 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %21, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%22 = vector.extract_strided_slice %cst_0 {offsets = [4, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %22, %17[%c4, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%23 = vector.extract_strided_slice %cst_0 {offsets = [5, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %23, %17[%c5, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%24 = vector.extract_strided_slice %cst_0 {offsets = [6, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %24, %17[%c6, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%25 = vector.extract_strided_slice %cst_0 {offsets = [7, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %25, %17[%c7, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%26 = vector.extract_strided_slice %cst_0 {offsets = [8, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %26, %17[%c8, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%27 = vector.extract_strided_slice %cst_0 {offsets = [9, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %27, %17[%c9, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%28 = vector.extract_strided_slice %cst_0 {offsets = [10, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %28, %17[%c10, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%29 = vector.extract_strided_slice %cst_0 {offsets = [11, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %29, %17[%c11, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%30 = vector.extract_strided_slice %cst_0 {offsets = [12, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %30, %17[%c12, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%31 = vector.extract_strided_slice %cst_0 {offsets = [13, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %31, %17[%c13, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%32 = vector.extract_strided_slice %cst_0 {offsets = [14, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %32, %17[%c14, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%33 = vector.extract_strided_slice %cst_0 {offsets = [15, 0], sizes = [1, 4], strides = [1, 1]} : vector<16x4xf32> to vector<1x4xf32>
vector.transfer_write %33, %17[%c15, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
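      // The sixteen writes above are the unrolled linalg.fill: rows 0..15 of
      // the thread's 16x4 accumulator tile, zeroed one vector<1x4xf32> at a
      // time.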
scf.for %arg2 = %c0 to %c1024 step %c8 {
%34 = memref.subview %12[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%35 = memref.subview %13[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%36 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%37 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%38 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%39 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%40 = vector.transfer_read %34[%38, %39], %cst {in_bounds = [true, true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %40, %36[%38, %39] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%41 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%42 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%43 = vector.transfer_read %35[%41, %42], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%44 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%45 = vector.transfer_read %35[%44, %42], %cst {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %43, %37[%41, %42] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %45, %37[%44, %42] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%46 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%47 = memref.subview %36[%15, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%48 = memref.subview %37[0, %46] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
%49 = memref.subview %14[%15, %46] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%50 = vector.transfer_read %47[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%51 = vector.transfer_read %47[%c0, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%52 = vector.transfer_read %47[%c1, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%53 = vector.transfer_read %47[%c1, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%54 = vector.transfer_read %47[%c2, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%55 = vector.transfer_read %47[%c2, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%56 = vector.transfer_read %47[%c3, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%57 = vector.transfer_read %47[%c3, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%58 = vector.transfer_read %47[%c4, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%59 = vector.transfer_read %47[%c4, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%60 = vector.transfer_read %47[%c5, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%61 = vector.transfer_read %47[%c5, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%62 = vector.transfer_read %47[%c6, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%63 = vector.transfer_read %47[%c6, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%64 = vector.transfer_read %47[%c7, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%65 = vector.transfer_read %47[%c7, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%66 = vector.transfer_read %47[%c8, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%67 = vector.transfer_read %47[%c8, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%68 = vector.transfer_read %47[%c9, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%69 = vector.transfer_read %47[%c9, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%70 = vector.transfer_read %47[%c10, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%71 = vector.transfer_read %47[%c10, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%72 = vector.transfer_read %47[%c11, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%73 = vector.transfer_read %47[%c11, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%74 = vector.transfer_read %47[%c12, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%75 = vector.transfer_read %47[%c12, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%76 = vector.transfer_read %47[%c13, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%77 = vector.transfer_read %47[%c13, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%78 = vector.transfer_read %47[%c14, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%79 = vector.transfer_read %47[%c14, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%80 = vector.transfer_read %47[%c15, %c0], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%81 = vector.transfer_read %47[%c15, %c4], %cst {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
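        // %50..%81 load the thread's 16x8 A sub-tile from shared memory, two
        // vector<1x4xf32> reads per row (columns 0..3 and 4..7).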
%82 = vector.transfer_read %48[%c0, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%83 = vector.transpose %82, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
%84 = vector.transfer_read %48[%c1, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%85 = vector.transpose %84, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
%86 = vector.transfer_read %48[%c2, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%87 = vector.transpose %86, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
%88 = vector.transfer_read %48[%c3, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%89 = vector.transpose %88, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
%90 = vector.transfer_read %48[%c4, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%91 = vector.transpose %90, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
%92 = vector.transfer_read %48[%c5, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%93 = vector.transpose %92, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
%94 = vector.transfer_read %48[%c6, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%95 = vector.transpose %94, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
%96 = vector.transfer_read %48[%c7, %c0], %cst {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%97 = vector.transpose %96, [1, 0] : vector<1x4xf32> to vector<4x1xf32>
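        // %82..%96 load the eight rows of the 8x4 B sub-tile. Each 1x4 row is
        // transposed to 4x1 here and transposed back at the use site; the
        // round-trip is presumably folded by later canonicalization.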
%98 = vector.transfer_read %49[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%99 = vector.transfer_read %49[%c1, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%100 = vector.transfer_read %49[%c2, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%101 = vector.transfer_read %49[%c3, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%102 = vector.transfer_read %49[%c4, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%103 = vector.transfer_read %49[%c5, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%104 = vector.transfer_read %49[%c6, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%105 = vector.transfer_read %49[%c7, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%106 = vector.transfer_read %49[%c8, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%107 = vector.transfer_read %49[%c9, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%108 = vector.transfer_read %49[%c10, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%109 = vector.transfer_read %49[%c11, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%110 = vector.transfer_read %49[%c12, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%111 = vector.transfer_read %49[%c13, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%112 = vector.transfer_read %49[%c14, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%113 = vector.transfer_read %49[%c15, %c0], %cst {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
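        // %98..%113 reload the sixteen accumulator rows of the output tile
        // from global memory.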
%114 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%115 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%116 = vector.extract %115[0] : vector<1x4xf32>
%117 = vector.extract %114[0, 0] : vector<1x1xf32>
%118 = splat %117 : vector<4xf32>
%119 = vector.extract %98[0] : vector<1x4xf32>
%120 = vector.fma %118, %116, %119 : vector<4xf32>
%121 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%122 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%123 = vector.extract %122[0] : vector<1x4xf32>
%124 = vector.extract %121[0, 0] : vector<1x1xf32>
%125 = splat %124 : vector<4xf32>
%126 = vector.fma %125, %123, %120 : vector<4xf32>
%127 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%128 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%129 = vector.extract %128[0] : vector<1x4xf32>
%130 = vector.extract %127[0, 0] : vector<1x1xf32>
%131 = splat %130 : vector<4xf32>
%132 = vector.fma %131, %129, %126 : vector<4xf32>
%133 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%134 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%135 = vector.extract %134[0] : vector<1x4xf32>
%136 = vector.extract %133[0, 0] : vector<1x1xf32>
%137 = splat %136 : vector<4xf32>
%138 = vector.fma %137, %135, %132 : vector<4xf32>
%139 = vector.extract_strided_slice %51 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%140 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%141 = vector.extract %140[0] : vector<1x4xf32>
%142 = vector.extract %139[0, 0] : vector<1x1xf32>
%143 = splat %142 : vector<4xf32>
%144 = vector.fma %143, %141, %138 : vector<4xf32>
%145 = vector.extract_strided_slice %51 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%146 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%147 = vector.extract %146[0] : vector<1x4xf32>
%148 = vector.extract %145[0, 0] : vector<1x1xf32>
%149 = splat %148 : vector<4xf32>
%150 = vector.fma %149, %147, %144 : vector<4xf32>
%151 = vector.extract_strided_slice %51 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%152 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%153 = vector.extract %152[0] : vector<1x4xf32>
%154 = vector.extract %151[0, 0] : vector<1x1xf32>
%155 = splat %154 : vector<4xf32>
%156 = vector.fma %155, %153, %150 : vector<4xf32>
%157 = vector.extract_strided_slice %51 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%158 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%159 = vector.extract %158[0] : vector<1x4xf32>
%160 = vector.extract %157[0, 0] : vector<1x1xf32>
%161 = splat %160 : vector<4xf32>
%162 = vector.fma %161, %159, %156 : vector<4xf32>
%163 = vector.insert %162, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
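        // %114..%162 compute output row 0 as eight chained vector.fma ops,
        // one per reduction step k = 0..7: each step splats the scalar
        // A[0][k] (sliced out of %50/%51) and multiplies it by B row k, i.e.
        // C[0][0:4] += A[0][k] * B[k][0:4]. Rows 1..15 below repeat the same
        // pattern against their own A and C rows.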
%164 = vector.extract_strided_slice %52 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%165 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%166 = vector.extract %165[0] : vector<1x4xf32>
%167 = vector.extract %164[0, 0] : vector<1x1xf32>
%168 = splat %167 : vector<4xf32>
%169 = vector.extract %99[0] : vector<1x4xf32>
%170 = vector.fma %168, %166, %169 : vector<4xf32>
%171 = vector.extract_strided_slice %52 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%172 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%173 = vector.extract %172[0] : vector<1x4xf32>
%174 = vector.extract %171[0, 0] : vector<1x1xf32>
%175 = splat %174 : vector<4xf32>
%176 = vector.fma %175, %173, %170 : vector<4xf32>
%177 = vector.extract_strided_slice %52 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%178 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%179 = vector.extract %178[0] : vector<1x4xf32>
%180 = vector.extract %177[0, 0] : vector<1x1xf32>
%181 = splat %180 : vector<4xf32>
%182 = vector.fma %181, %179, %176 : vector<4xf32>
%183 = vector.extract_strided_slice %52 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%184 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%185 = vector.extract %184[0] : vector<1x4xf32>
%186 = vector.extract %183[0, 0] : vector<1x1xf32>
%187 = splat %186 : vector<4xf32>
%188 = vector.fma %187, %185, %182 : vector<4xf32>
%189 = vector.extract_strided_slice %53 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%190 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%191 = vector.extract %190[0] : vector<1x4xf32>
%192 = vector.extract %189[0, 0] : vector<1x1xf32>
%193 = splat %192 : vector<4xf32>
%194 = vector.fma %193, %191, %188 : vector<4xf32>
%195 = vector.extract_strided_slice %53 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%196 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%197 = vector.extract %196[0] : vector<1x4xf32>
%198 = vector.extract %195[0, 0] : vector<1x1xf32>
%199 = splat %198 : vector<4xf32>
%200 = vector.fma %199, %197, %194 : vector<4xf32>
%201 = vector.extract_strided_slice %53 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%202 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%203 = vector.extract %202[0] : vector<1x4xf32>
%204 = vector.extract %201[0, 0] : vector<1x1xf32>
%205 = splat %204 : vector<4xf32>
%206 = vector.fma %205, %203, %200 : vector<4xf32>
%207 = vector.extract_strided_slice %53 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%208 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%209 = vector.extract %208[0] : vector<1x4xf32>
%210 = vector.extract %207[0, 0] : vector<1x1xf32>
%211 = splat %210 : vector<4xf32>
%212 = vector.fma %211, %209, %206 : vector<4xf32>
%213 = vector.insert %212, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%214 = vector.extract_strided_slice %54 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%215 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%216 = vector.extract %215[0] : vector<1x4xf32>
%217 = vector.extract %214[0, 0] : vector<1x1xf32>
%218 = splat %217 : vector<4xf32>
%219 = vector.extract %100[0] : vector<1x4xf32>
%220 = vector.fma %218, %216, %219 : vector<4xf32>
%221 = vector.extract_strided_slice %54 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%222 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%223 = vector.extract %222[0] : vector<1x4xf32>
%224 = vector.extract %221[0, 0] : vector<1x1xf32>
%225 = splat %224 : vector<4xf32>
%226 = vector.fma %225, %223, %220 : vector<4xf32>
%227 = vector.extract_strided_slice %54 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%228 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%229 = vector.extract %228[0] : vector<1x4xf32>
%230 = vector.extract %227[0, 0] : vector<1x1xf32>
%231 = splat %230 : vector<4xf32>
%232 = vector.fma %231, %229, %226 : vector<4xf32>
%233 = vector.extract_strided_slice %54 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%234 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%235 = vector.extract %234[0] : vector<1x4xf32>
%236 = vector.extract %233[0, 0] : vector<1x1xf32>
%237 = splat %236 : vector<4xf32>
%238 = vector.fma %237, %235, %232 : vector<4xf32>
%239 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%240 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%241 = vector.extract %240[0] : vector<1x4xf32>
%242 = vector.extract %239[0, 0] : vector<1x1xf32>
%243 = splat %242 : vector<4xf32>
%244 = vector.fma %243, %241, %238 : vector<4xf32>
%245 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%246 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%247 = vector.extract %246[0] : vector<1x4xf32>
%248 = vector.extract %245[0, 0] : vector<1x1xf32>
%249 = splat %248 : vector<4xf32>
%250 = vector.fma %249, %247, %244 : vector<4xf32>
%251 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%252 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%253 = vector.extract %252[0] : vector<1x4xf32>
%254 = vector.extract %251[0, 0] : vector<1x1xf32>
%255 = splat %254 : vector<4xf32>
%256 = vector.fma %255, %253, %250 : vector<4xf32>
%257 = vector.extract_strided_slice %55 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%258 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%259 = vector.extract %258[0] : vector<1x4xf32>
%260 = vector.extract %257[0, 0] : vector<1x1xf32>
%261 = splat %260 : vector<4xf32>
%262 = vector.fma %261, %259, %256 : vector<4xf32>
%263 = vector.insert %262, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%264 = vector.extract_strided_slice %56 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%265 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%266 = vector.extract %265[0] : vector<1x4xf32>
%267 = vector.extract %264[0, 0] : vector<1x1xf32>
%268 = splat %267 : vector<4xf32>
%269 = vector.extract %101[0] : vector<1x4xf32>
%270 = vector.fma %268, %266, %269 : vector<4xf32>
%271 = vector.extract_strided_slice %56 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%272 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%273 = vector.extract %272[0] : vector<1x4xf32>
%274 = vector.extract %271[0, 0] : vector<1x1xf32>
%275 = splat %274 : vector<4xf32>
%276 = vector.fma %275, %273, %270 : vector<4xf32>
%277 = vector.extract_strided_slice %56 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%278 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%279 = vector.extract %278[0] : vector<1x4xf32>
%280 = vector.extract %277[0, 0] : vector<1x1xf32>
%281 = splat %280 : vector<4xf32>
%282 = vector.fma %281, %279, %276 : vector<4xf32>
%283 = vector.extract_strided_slice %56 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%284 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%285 = vector.extract %284[0] : vector<1x4xf32>
%286 = vector.extract %283[0, 0] : vector<1x1xf32>
%287 = splat %286 : vector<4xf32>
%288 = vector.fma %287, %285, %282 : vector<4xf32>
%289 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%290 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%291 = vector.extract %290[0] : vector<1x4xf32>
%292 = vector.extract %289[0, 0] : vector<1x1xf32>
%293 = splat %292 : vector<4xf32>
%294 = vector.fma %293, %291, %288 : vector<4xf32>
%295 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%296 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%297 = vector.extract %296[0] : vector<1x4xf32>
%298 = vector.extract %295[0, 0] : vector<1x1xf32>
%299 = splat %298 : vector<4xf32>
%300 = vector.fma %299, %297, %294 : vector<4xf32>
%301 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%302 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%303 = vector.extract %302[0] : vector<1x4xf32>
%304 = vector.extract %301[0, 0] : vector<1x1xf32>
%305 = splat %304 : vector<4xf32>
%306 = vector.fma %305, %303, %300 : vector<4xf32>
%307 = vector.extract_strided_slice %57 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%308 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%309 = vector.extract %308[0] : vector<1x4xf32>
%310 = vector.extract %307[0, 0] : vector<1x1xf32>
%311 = splat %310 : vector<4xf32>
%312 = vector.fma %311, %309, %306 : vector<4xf32>
%313 = vector.insert %312, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%314 = vector.extract_strided_slice %58 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%315 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%316 = vector.extract %315[0] : vector<1x4xf32>
%317 = vector.extract %314[0, 0] : vector<1x1xf32>
%318 = splat %317 : vector<4xf32>
%319 = vector.extract %102[0] : vector<1x4xf32>
%320 = vector.fma %318, %316, %319 : vector<4xf32>
%321 = vector.extract_strided_slice %58 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%322 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%323 = vector.extract %322[0] : vector<1x4xf32>
%324 = vector.extract %321[0, 0] : vector<1x1xf32>
%325 = splat %324 : vector<4xf32>
%326 = vector.fma %325, %323, %320 : vector<4xf32>
%327 = vector.extract_strided_slice %58 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%328 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%329 = vector.extract %328[0] : vector<1x4xf32>
%330 = vector.extract %327[0, 0] : vector<1x1xf32>
%331 = splat %330 : vector<4xf32>
%332 = vector.fma %331, %329, %326 : vector<4xf32>
%333 = vector.extract_strided_slice %58 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%334 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%335 = vector.extract %334[0] : vector<1x4xf32>
%336 = vector.extract %333[0, 0] : vector<1x1xf32>
%337 = splat %336 : vector<4xf32>
%338 = vector.fma %337, %335, %332 : vector<4xf32>
%339 = vector.extract_strided_slice %59 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%340 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%341 = vector.extract %340[0] : vector<1x4xf32>
%342 = vector.extract %339[0, 0] : vector<1x1xf32>
%343 = splat %342 : vector<4xf32>
%344 = vector.fma %343, %341, %338 : vector<4xf32>
%345 = vector.extract_strided_slice %59 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%346 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%347 = vector.extract %346[0] : vector<1x4xf32>
%348 = vector.extract %345[0, 0] : vector<1x1xf32>
%349 = splat %348 : vector<4xf32>
%350 = vector.fma %349, %347, %344 : vector<4xf32>
%351 = vector.extract_strided_slice %59 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%352 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%353 = vector.extract %352[0] : vector<1x4xf32>
%354 = vector.extract %351[0, 0] : vector<1x1xf32>
%355 = splat %354 : vector<4xf32>
%356 = vector.fma %355, %353, %350 : vector<4xf32>
%357 = vector.extract_strided_slice %59 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%358 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%359 = vector.extract %358[0] : vector<1x4xf32>
%360 = vector.extract %357[0, 0] : vector<1x1xf32>
%361 = splat %360 : vector<4xf32>
%362 = vector.fma %361, %359, %356 : vector<4xf32>
%363 = vector.insert %362, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%364 = vector.extract_strided_slice %60 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%365 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%366 = vector.extract %365[0] : vector<1x4xf32>
%367 = vector.extract %364[0, 0] : vector<1x1xf32>
%368 = splat %367 : vector<4xf32>
%369 = vector.extract %103[0] : vector<1x4xf32>
%370 = vector.fma %368, %366, %369 : vector<4xf32>
%371 = vector.extract_strided_slice %60 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%372 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%373 = vector.extract %372[0] : vector<1x4xf32>
%374 = vector.extract %371[0, 0] : vector<1x1xf32>
%375 = splat %374 : vector<4xf32>
%376 = vector.fma %375, %373, %370 : vector<4xf32>
%377 = vector.extract_strided_slice %60 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%378 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%379 = vector.extract %378[0] : vector<1x4xf32>
%380 = vector.extract %377[0, 0] : vector<1x1xf32>
%381 = splat %380 : vector<4xf32>
%382 = vector.fma %381, %379, %376 : vector<4xf32>
%383 = vector.extract_strided_slice %60 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%384 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%385 = vector.extract %384[0] : vector<1x4xf32>
%386 = vector.extract %383[0, 0] : vector<1x1xf32>
%387 = splat %386 : vector<4xf32>
%388 = vector.fma %387, %385, %382 : vector<4xf32>
%389 = vector.extract_strided_slice %61 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%390 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%391 = vector.extract %390[0] : vector<1x4xf32>
%392 = vector.extract %389[0, 0] : vector<1x1xf32>
%393 = splat %392 : vector<4xf32>
%394 = vector.fma %393, %391, %388 : vector<4xf32>
%395 = vector.extract_strided_slice %61 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%396 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%397 = vector.extract %396[0] : vector<1x4xf32>
%398 = vector.extract %395[0, 0] : vector<1x1xf32>
%399 = splat %398 : vector<4xf32>
%400 = vector.fma %399, %397, %394 : vector<4xf32>
%401 = vector.extract_strided_slice %61 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%402 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%403 = vector.extract %402[0] : vector<1x4xf32>
%404 = vector.extract %401[0, 0] : vector<1x1xf32>
%405 = splat %404 : vector<4xf32>
%406 = vector.fma %405, %403, %400 : vector<4xf32>
%407 = vector.extract_strided_slice %61 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%408 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%409 = vector.extract %408[0] : vector<1x4xf32>
%410 = vector.extract %407[0, 0] : vector<1x1xf32>
%411 = splat %410 : vector<4xf32>
%412 = vector.fma %411, %409, %406 : vector<4xf32>
%413 = vector.insert %412, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%414 = vector.extract_strided_slice %62 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%415 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%416 = vector.extract %415[0] : vector<1x4xf32>
%417 = vector.extract %414[0, 0] : vector<1x1xf32>
%418 = splat %417 : vector<4xf32>
%419 = vector.extract %104[0] : vector<1x4xf32>
%420 = vector.fma %418, %416, %419 : vector<4xf32>
%421 = vector.extract_strided_slice %62 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%422 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%423 = vector.extract %422[0] : vector<1x4xf32>
%424 = vector.extract %421[0, 0] : vector<1x1xf32>
%425 = splat %424 : vector<4xf32>
%426 = vector.fma %425, %423, %420 : vector<4xf32>
%427 = vector.extract_strided_slice %62 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%428 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%429 = vector.extract %428[0] : vector<1x4xf32>
%430 = vector.extract %427[0, 0] : vector<1x1xf32>
%431 = splat %430 : vector<4xf32>
%432 = vector.fma %431, %429, %426 : vector<4xf32>
%433 = vector.extract_strided_slice %62 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%434 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%435 = vector.extract %434[0] : vector<1x4xf32>
%436 = vector.extract %433[0, 0] : vector<1x1xf32>
%437 = splat %436 : vector<4xf32>
%438 = vector.fma %437, %435, %432 : vector<4xf32>
%439 = vector.extract_strided_slice %63 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%440 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%441 = vector.extract %440[0] : vector<1x4xf32>
%442 = vector.extract %439[0, 0] : vector<1x1xf32>
%443 = splat %442 : vector<4xf32>
%444 = vector.fma %443, %441, %438 : vector<4xf32>
%445 = vector.extract_strided_slice %63 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%446 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%447 = vector.extract %446[0] : vector<1x4xf32>
%448 = vector.extract %445[0, 0] : vector<1x1xf32>
%449 = splat %448 : vector<4xf32>
%450 = vector.fma %449, %447, %444 : vector<4xf32>
%451 = vector.extract_strided_slice %63 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%452 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%453 = vector.extract %452[0] : vector<1x4xf32>
%454 = vector.extract %451[0, 0] : vector<1x1xf32>
%455 = splat %454 : vector<4xf32>
%456 = vector.fma %455, %453, %450 : vector<4xf32>
%457 = vector.extract_strided_slice %63 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%458 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%459 = vector.extract %458[0] : vector<1x4xf32>
%460 = vector.extract %457[0, 0] : vector<1x1xf32>
%461 = splat %460 : vector<4xf32>
%462 = vector.fma %461, %459, %456 : vector<4xf32>
%463 = vector.insert %462, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%464 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%465 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%466 = vector.extract %465[0] : vector<1x4xf32>
%467 = vector.extract %464[0, 0] : vector<1x1xf32>
%468 = splat %467 : vector<4xf32>
%469 = vector.extract %105[0] : vector<1x4xf32>
%470 = vector.fma %468, %466, %469 : vector<4xf32>
%471 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%472 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%473 = vector.extract %472[0] : vector<1x4xf32>
%474 = vector.extract %471[0, 0] : vector<1x1xf32>
%475 = splat %474 : vector<4xf32>
%476 = vector.fma %475, %473, %470 : vector<4xf32>
%477 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%478 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%479 = vector.extract %478[0] : vector<1x4xf32>
%480 = vector.extract %477[0, 0] : vector<1x1xf32>
%481 = splat %480 : vector<4xf32>
%482 = vector.fma %481, %479, %476 : vector<4xf32>
%483 = vector.extract_strided_slice %64 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%484 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%485 = vector.extract %484[0] : vector<1x4xf32>
%486 = vector.extract %483[0, 0] : vector<1x1xf32>
%487 = splat %486 : vector<4xf32>
%488 = vector.fma %487, %485, %482 : vector<4xf32>
%489 = vector.extract_strided_slice %65 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%490 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%491 = vector.extract %490[0] : vector<1x4xf32>
%492 = vector.extract %489[0, 0] : vector<1x1xf32>
%493 = splat %492 : vector<4xf32>
%494 = vector.fma %493, %491, %488 : vector<4xf32>
%495 = vector.extract_strided_slice %65 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%496 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%497 = vector.extract %496[0] : vector<1x4xf32>
%498 = vector.extract %495[0, 0] : vector<1x1xf32>
%499 = splat %498 : vector<4xf32>
%500 = vector.fma %499, %497, %494 : vector<4xf32>
%501 = vector.extract_strided_slice %65 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%502 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%503 = vector.extract %502[0] : vector<1x4xf32>
%504 = vector.extract %501[0, 0] : vector<1x1xf32>
%505 = splat %504 : vector<4xf32>
%506 = vector.fma %505, %503, %500 : vector<4xf32>
%507 = vector.extract_strided_slice %65 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%508 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%509 = vector.extract %508[0] : vector<1x4xf32>
%510 = vector.extract %507[0, 0] : vector<1x1xf32>
%511 = splat %510 : vector<4xf32>
%512 = vector.fma %511, %509, %506 : vector<4xf32>
%513 = vector.insert %512, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%514 = vector.extract_strided_slice %66 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%515 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%516 = vector.extract %515[0] : vector<1x4xf32>
%517 = vector.extract %514[0, 0] : vector<1x1xf32>
%518 = splat %517 : vector<4xf32>
%519 = vector.extract %106[0] : vector<1x4xf32>
%520 = vector.fma %518, %516, %519 : vector<4xf32>
%521 = vector.extract_strided_slice %66 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%522 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%523 = vector.extract %522[0] : vector<1x4xf32>
%524 = vector.extract %521[0, 0] : vector<1x1xf32>
%525 = splat %524 : vector<4xf32>
%526 = vector.fma %525, %523, %520 : vector<4xf32>
%527 = vector.extract_strided_slice %66 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%528 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%529 = vector.extract %528[0] : vector<1x4xf32>
%530 = vector.extract %527[0, 0] : vector<1x1xf32>
%531 = splat %530 : vector<4xf32>
%532 = vector.fma %531, %529, %526 : vector<4xf32>
%533 = vector.extract_strided_slice %66 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%534 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%535 = vector.extract %534[0] : vector<1x4xf32>
%536 = vector.extract %533[0, 0] : vector<1x1xf32>
%537 = splat %536 : vector<4xf32>
%538 = vector.fma %537, %535, %532 : vector<4xf32>
%539 = vector.extract_strided_slice %67 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%540 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%541 = vector.extract %540[0] : vector<1x4xf32>
%542 = vector.extract %539[0, 0] : vector<1x1xf32>
%543 = splat %542 : vector<4xf32>
%544 = vector.fma %543, %541, %538 : vector<4xf32>
%545 = vector.extract_strided_slice %67 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%546 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%547 = vector.extract %546[0] : vector<1x4xf32>
%548 = vector.extract %545[0, 0] : vector<1x1xf32>
%549 = splat %548 : vector<4xf32>
%550 = vector.fma %549, %547, %544 : vector<4xf32>
%551 = vector.extract_strided_slice %67 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%552 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%553 = vector.extract %552[0] : vector<1x4xf32>
%554 = vector.extract %551[0, 0] : vector<1x1xf32>
%555 = splat %554 : vector<4xf32>
%556 = vector.fma %555, %553, %550 : vector<4xf32>
%557 = vector.extract_strided_slice %67 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%558 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%559 = vector.extract %558[0] : vector<1x4xf32>
%560 = vector.extract %557[0, 0] : vector<1x1xf32>
%561 = splat %560 : vector<4xf32>
%562 = vector.fma %561, %559, %556 : vector<4xf32>
%563 = vector.insert %562, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%564 = vector.extract_strided_slice %68 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%565 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%566 = vector.extract %565[0] : vector<1x4xf32>
%567 = vector.extract %564[0, 0] : vector<1x1xf32>
%568 = splat %567 : vector<4xf32>
%569 = vector.extract %107[0] : vector<1x4xf32>
%570 = vector.fma %568, %566, %569 : vector<4xf32>
%571 = vector.extract_strided_slice %68 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%572 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%573 = vector.extract %572[0] : vector<1x4xf32>
%574 = vector.extract %571[0, 0] : vector<1x1xf32>
%575 = splat %574 : vector<4xf32>
%576 = vector.fma %575, %573, %570 : vector<4xf32>
%577 = vector.extract_strided_slice %68 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%578 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%579 = vector.extract %578[0] : vector<1x4xf32>
%580 = vector.extract %577[0, 0] : vector<1x1xf32>
%581 = splat %580 : vector<4xf32>
%582 = vector.fma %581, %579, %576 : vector<4xf32>
%583 = vector.extract_strided_slice %68 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%584 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%585 = vector.extract %584[0] : vector<1x4xf32>
%586 = vector.extract %583[0, 0] : vector<1x1xf32>
%587 = splat %586 : vector<4xf32>
%588 = vector.fma %587, %585, %582 : vector<4xf32>
%589 = vector.extract_strided_slice %69 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%590 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%591 = vector.extract %590[0] : vector<1x4xf32>
%592 = vector.extract %589[0, 0] : vector<1x1xf32>
%593 = splat %592 : vector<4xf32>
%594 = vector.fma %593, %591, %588 : vector<4xf32>
%595 = vector.extract_strided_slice %69 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%596 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%597 = vector.extract %596[0] : vector<1x4xf32>
%598 = vector.extract %595[0, 0] : vector<1x1xf32>
%599 = splat %598 : vector<4xf32>
%600 = vector.fma %599, %597, %594 : vector<4xf32>
%601 = vector.extract_strided_slice %69 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%602 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%603 = vector.extract %602[0] : vector<1x4xf32>
%604 = vector.extract %601[0, 0] : vector<1x1xf32>
%605 = splat %604 : vector<4xf32>
%606 = vector.fma %605, %603, %600 : vector<4xf32>
%607 = vector.extract_strided_slice %69 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%608 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%609 = vector.extract %608[0] : vector<1x4xf32>
%610 = vector.extract %607[0, 0] : vector<1x1xf32>
%611 = splat %610 : vector<4xf32>
%612 = vector.fma %611, %609, %606 : vector<4xf32>
%613 = vector.insert %612, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%614 = vector.extract_strided_slice %70 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%615 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%616 = vector.extract %615[0] : vector<1x4xf32>
%617 = vector.extract %614[0, 0] : vector<1x1xf32>
%618 = splat %617 : vector<4xf32>
%619 = vector.extract %108[0] : vector<1x4xf32>
%620 = vector.fma %618, %616, %619 : vector<4xf32>
%621 = vector.extract_strided_slice %70 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%622 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%623 = vector.extract %622[0] : vector<1x4xf32>
%624 = vector.extract %621[0, 0] : vector<1x1xf32>
%625 = splat %624 : vector<4xf32>
%626 = vector.fma %625, %623, %620 : vector<4xf32>
%627 = vector.extract_strided_slice %70 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%628 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%629 = vector.extract %628[0] : vector<1x4xf32>
%630 = vector.extract %627[0, 0] : vector<1x1xf32>
%631 = splat %630 : vector<4xf32>
%632 = vector.fma %631, %629, %626 : vector<4xf32>
%633 = vector.extract_strided_slice %70 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%634 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%635 = vector.extract %634[0] : vector<1x4xf32>
%636 = vector.extract %633[0, 0] : vector<1x1xf32>
%637 = splat %636 : vector<4xf32>
%638 = vector.fma %637, %635, %632 : vector<4xf32>
%639 = vector.extract_strided_slice %71 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%640 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%641 = vector.extract %640[0] : vector<1x4xf32>
%642 = vector.extract %639[0, 0] : vector<1x1xf32>
%643 = splat %642 : vector<4xf32>
%644 = vector.fma %643, %641, %638 : vector<4xf32>
%645 = vector.extract_strided_slice %71 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%646 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%647 = vector.extract %646[0] : vector<1x4xf32>
%648 = vector.extract %645[0, 0] : vector<1x1xf32>
%649 = splat %648 : vector<4xf32>
%650 = vector.fma %649, %647, %644 : vector<4xf32>
%651 = vector.extract_strided_slice %71 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%652 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%653 = vector.extract %652[0] : vector<1x4xf32>
%654 = vector.extract %651[0, 0] : vector<1x1xf32>
%655 = splat %654 : vector<4xf32>
%656 = vector.fma %655, %653, %650 : vector<4xf32>
%657 = vector.extract_strided_slice %71 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%658 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%659 = vector.extract %658[0] : vector<1x4xf32>
%660 = vector.extract %657[0, 0] : vector<1x1xf32>
%661 = splat %660 : vector<4xf32>
%662 = vector.fma %661, %659, %656 : vector<4xf32>
%663 = vector.insert %662, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%664 = vector.extract_strided_slice %72 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%665 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%666 = vector.extract %665[0] : vector<1x4xf32>
%667 = vector.extract %664[0, 0] : vector<1x1xf32>
%668 = splat %667 : vector<4xf32>
%669 = vector.extract %109[0] : vector<1x4xf32>
%670 = vector.fma %668, %666, %669 : vector<4xf32>
%671 = vector.extract_strided_slice %72 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%672 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%673 = vector.extract %672[0] : vector<1x4xf32>
%674 = vector.extract %671[0, 0] : vector<1x1xf32>
%675 = splat %674 : vector<4xf32>
%676 = vector.fma %675, %673, %670 : vector<4xf32>
%677 = vector.extract_strided_slice %72 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%678 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%679 = vector.extract %678[0] : vector<1x4xf32>
%680 = vector.extract %677[0, 0] : vector<1x1xf32>
%681 = splat %680 : vector<4xf32>
%682 = vector.fma %681, %679, %676 : vector<4xf32>
%683 = vector.extract_strided_slice %72 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%684 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%685 = vector.extract %684[0] : vector<1x4xf32>
%686 = vector.extract %683[0, 0] : vector<1x1xf32>
%687 = splat %686 : vector<4xf32>
%688 = vector.fma %687, %685, %682 : vector<4xf32>
%689 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%690 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%691 = vector.extract %690[0] : vector<1x4xf32>
%692 = vector.extract %689[0, 0] : vector<1x1xf32>
%693 = splat %692 : vector<4xf32>
%694 = vector.fma %693, %691, %688 : vector<4xf32>
%695 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%696 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%697 = vector.extract %696[0] : vector<1x4xf32>
%698 = vector.extract %695[0, 0] : vector<1x1xf32>
%699 = splat %698 : vector<4xf32>
%700 = vector.fma %699, %697, %694 : vector<4xf32>
%701 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%702 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%703 = vector.extract %702[0] : vector<1x4xf32>
%704 = vector.extract %701[0, 0] : vector<1x1xf32>
%705 = splat %704 : vector<4xf32>
%706 = vector.fma %705, %703, %700 : vector<4xf32>
%707 = vector.extract_strided_slice %73 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%708 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%709 = vector.extract %708[0] : vector<1x4xf32>
%710 = vector.extract %707[0, 0] : vector<1x1xf32>
%711 = splat %710 : vector<4xf32>
%712 = vector.fma %711, %709, %706 : vector<4xf32>
%713 = vector.insert %712, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%714 = vector.extract_strided_slice %74 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%715 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%716 = vector.extract %715[0] : vector<1x4xf32>
%717 = vector.extract %714[0, 0] : vector<1x1xf32>
%718 = splat %717 : vector<4xf32>
%719 = vector.extract %110[0] : vector<1x4xf32>
%720 = vector.fma %718, %716, %719 : vector<4xf32>
%721 = vector.extract_strided_slice %74 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%722 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%723 = vector.extract %722[0] : vector<1x4xf32>
%724 = vector.extract %721[0, 0] : vector<1x1xf32>
%725 = splat %724 : vector<4xf32>
%726 = vector.fma %725, %723, %720 : vector<4xf32>
%727 = vector.extract_strided_slice %74 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%728 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%729 = vector.extract %728[0] : vector<1x4xf32>
%730 = vector.extract %727[0, 0] : vector<1x1xf32>
%731 = splat %730 : vector<4xf32>
%732 = vector.fma %731, %729, %726 : vector<4xf32>
%733 = vector.extract_strided_slice %74 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%734 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%735 = vector.extract %734[0] : vector<1x4xf32>
%736 = vector.extract %733[0, 0] : vector<1x1xf32>
%737 = splat %736 : vector<4xf32>
%738 = vector.fma %737, %735, %732 : vector<4xf32>
%739 = vector.extract_strided_slice %75 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%740 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%741 = vector.extract %740[0] : vector<1x4xf32>
%742 = vector.extract %739[0, 0] : vector<1x1xf32>
%743 = splat %742 : vector<4xf32>
%744 = vector.fma %743, %741, %738 : vector<4xf32>
%745 = vector.extract_strided_slice %75 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%746 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%747 = vector.extract %746[0] : vector<1x4xf32>
%748 = vector.extract %745[0, 0] : vector<1x1xf32>
%749 = splat %748 : vector<4xf32>
%750 = vector.fma %749, %747, %744 : vector<4xf32>
%751 = vector.extract_strided_slice %75 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%752 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%753 = vector.extract %752[0] : vector<1x4xf32>
%754 = vector.extract %751[0, 0] : vector<1x1xf32>
%755 = splat %754 : vector<4xf32>
%756 = vector.fma %755, %753, %750 : vector<4xf32>
%757 = vector.extract_strided_slice %75 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%758 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%759 = vector.extract %758[0] : vector<1x4xf32>
%760 = vector.extract %757[0, 0] : vector<1x1xf32>
%761 = splat %760 : vector<4xf32>
%762 = vector.fma %761, %759, %756 : vector<4xf32>
%763 = vector.insert %762, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%764 = vector.extract_strided_slice %76 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%765 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%766 = vector.extract %765[0] : vector<1x4xf32>
%767 = vector.extract %764[0, 0] : vector<1x1xf32>
%768 = splat %767 : vector<4xf32>
%769 = vector.extract %111[0] : vector<1x4xf32>
%770 = vector.fma %768, %766, %769 : vector<4xf32>
%771 = vector.extract_strided_slice %76 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%772 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%773 = vector.extract %772[0] : vector<1x4xf32>
%774 = vector.extract %771[0, 0] : vector<1x1xf32>
%775 = splat %774 : vector<4xf32>
%776 = vector.fma %775, %773, %770 : vector<4xf32>
%777 = vector.extract_strided_slice %76 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%778 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%779 = vector.extract %778[0] : vector<1x4xf32>
%780 = vector.extract %777[0, 0] : vector<1x1xf32>
%781 = splat %780 : vector<4xf32>
%782 = vector.fma %781, %779, %776 : vector<4xf32>
%783 = vector.extract_strided_slice %76 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%784 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%785 = vector.extract %784[0] : vector<1x4xf32>
%786 = vector.extract %783[0, 0] : vector<1x1xf32>
%787 = splat %786 : vector<4xf32>
%788 = vector.fma %787, %785, %782 : vector<4xf32>
%789 = vector.extract_strided_slice %77 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%790 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%791 = vector.extract %790[0] : vector<1x4xf32>
%792 = vector.extract %789[0, 0] : vector<1x1xf32>
%793 = splat %792 : vector<4xf32>
%794 = vector.fma %793, %791, %788 : vector<4xf32>
%795 = vector.extract_strided_slice %77 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%796 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%797 = vector.extract %796[0] : vector<1x4xf32>
%798 = vector.extract %795[0, 0] : vector<1x1xf32>
%799 = splat %798 : vector<4xf32>
%800 = vector.fma %799, %797, %794 : vector<4xf32>
%801 = vector.extract_strided_slice %77 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%802 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%803 = vector.extract %802[0] : vector<1x4xf32>
%804 = vector.extract %801[0, 0] : vector<1x1xf32>
%805 = splat %804 : vector<4xf32>
%806 = vector.fma %805, %803, %800 : vector<4xf32>
%807 = vector.extract_strided_slice %77 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%808 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%809 = vector.extract %808[0] : vector<1x4xf32>
%810 = vector.extract %807[0, 0] : vector<1x1xf32>
%811 = splat %810 : vector<4xf32>
%812 = vector.fma %811, %809, %806 : vector<4xf32>
%813 = vector.insert %812, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%814 = vector.extract_strided_slice %78 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%815 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%816 = vector.extract %815[0] : vector<1x4xf32>
%817 = vector.extract %814[0, 0] : vector<1x1xf32>
%818 = splat %817 : vector<4xf32>
%819 = vector.extract %112[0] : vector<1x4xf32>
%820 = vector.fma %818, %816, %819 : vector<4xf32>
%821 = vector.extract_strided_slice %78 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%822 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%823 = vector.extract %822[0] : vector<1x4xf32>
%824 = vector.extract %821[0, 0] : vector<1x1xf32>
%825 = splat %824 : vector<4xf32>
%826 = vector.fma %825, %823, %820 : vector<4xf32>
%827 = vector.extract_strided_slice %78 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%828 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%829 = vector.extract %828[0] : vector<1x4xf32>
%830 = vector.extract %827[0, 0] : vector<1x1xf32>
%831 = splat %830 : vector<4xf32>
%832 = vector.fma %831, %829, %826 : vector<4xf32>
%833 = vector.extract_strided_slice %78 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%834 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%835 = vector.extract %834[0] : vector<1x4xf32>
%836 = vector.extract %833[0, 0] : vector<1x1xf32>
%837 = splat %836 : vector<4xf32>
%838 = vector.fma %837, %835, %832 : vector<4xf32>
%839 = vector.extract_strided_slice %79 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%840 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%841 = vector.extract %840[0] : vector<1x4xf32>
%842 = vector.extract %839[0, 0] : vector<1x1xf32>
%843 = splat %842 : vector<4xf32>
%844 = vector.fma %843, %841, %838 : vector<4xf32>
%845 = vector.extract_strided_slice %79 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%846 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%847 = vector.extract %846[0] : vector<1x4xf32>
%848 = vector.extract %845[0, 0] : vector<1x1xf32>
%849 = splat %848 : vector<4xf32>
%850 = vector.fma %849, %847, %844 : vector<4xf32>
%851 = vector.extract_strided_slice %79 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%852 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%853 = vector.extract %852[0] : vector<1x4xf32>
%854 = vector.extract %851[0, 0] : vector<1x1xf32>
%855 = splat %854 : vector<4xf32>
%856 = vector.fma %855, %853, %850 : vector<4xf32>
%857 = vector.extract_strided_slice %79 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%858 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%859 = vector.extract %858[0] : vector<1x4xf32>
%860 = vector.extract %857[0, 0] : vector<1x1xf32>
%861 = splat %860 : vector<4xf32>
%862 = vector.fma %861, %859, %856 : vector<4xf32>
%863 = vector.insert %862, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%864 = vector.extract_strided_slice %80 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%865 = vector.transpose %83, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%866 = vector.extract %865[0] : vector<1x4xf32>
%867 = vector.extract %864[0, 0] : vector<1x1xf32>
%868 = splat %867 : vector<4xf32>
%869 = vector.extract %113[0] : vector<1x4xf32>
%870 = vector.fma %868, %866, %869 : vector<4xf32>
%871 = vector.extract_strided_slice %80 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%872 = vector.transpose %85, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%873 = vector.extract %872[0] : vector<1x4xf32>
%874 = vector.extract %871[0, 0] : vector<1x1xf32>
%875 = splat %874 : vector<4xf32>
%876 = vector.fma %875, %873, %870 : vector<4xf32>
%877 = vector.extract_strided_slice %80 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%878 = vector.transpose %87, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%879 = vector.extract %878[0] : vector<1x4xf32>
%880 = vector.extract %877[0, 0] : vector<1x1xf32>
%881 = splat %880 : vector<4xf32>
%882 = vector.fma %881, %879, %876 : vector<4xf32>
%883 = vector.extract_strided_slice %80 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%884 = vector.transpose %89, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%885 = vector.extract %884[0] : vector<1x4xf32>
%886 = vector.extract %883[0, 0] : vector<1x1xf32>
%887 = splat %886 : vector<4xf32>
%888 = vector.fma %887, %885, %882 : vector<4xf32>
%889 = vector.extract_strided_slice %81 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%890 = vector.transpose %91, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%891 = vector.extract %890[0] : vector<1x4xf32>
%892 = vector.extract %889[0, 0] : vector<1x1xf32>
%893 = splat %892 : vector<4xf32>
%894 = vector.fma %893, %891, %888 : vector<4xf32>
%895 = vector.extract_strided_slice %81 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%896 = vector.transpose %93, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%897 = vector.extract %896[0] : vector<1x4xf32>
%898 = vector.extract %895[0, 0] : vector<1x1xf32>
%899 = splat %898 : vector<4xf32>
%900 = vector.fma %899, %897, %894 : vector<4xf32>
%901 = vector.extract_strided_slice %81 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%902 = vector.transpose %95, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%903 = vector.extract %902[0] : vector<1x4xf32>
%904 = vector.extract %901[0, 0] : vector<1x1xf32>
%905 = splat %904 : vector<4xf32>
%906 = vector.fma %905, %903, %900 : vector<4xf32>
%907 = vector.extract_strided_slice %81 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%908 = vector.transpose %97, [1, 0] : vector<4x1xf32> to vector<1x4xf32>
%909 = vector.extract %908[0] : vector<1x4xf32>
%910 = vector.extract %907[0, 0] : vector<1x1xf32>
%911 = splat %910 : vector<4xf32>
%912 = vector.fma %911, %909, %906 : vector<4xf32>
%913 = vector.insert %912, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
vector.transfer_write %163, %49[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %213, %49[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %263, %49[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %313, %49[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %363, %49[%c4, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %413, %49[%c5, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %463, %49[%c6, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %513, %49[%c7, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %563, %49[%c8, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %613, %49[%c9, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %663, %49[%c10, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %713, %49[%c11, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %763, %49[%c12, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %813, %49[%c13, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %863, %49[%c14, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %913, %49[%c15, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
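// The sixteen transfer_writes above store the accumulated rows (%163 ... %913) back
// to %49, the thread's 16x4 subview of the output tile, completing one step of the
// K-reduction loop that closes just below.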
}
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
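// In this canonicalized dump the vector.transpose (4x1 -> 1x4) + vector.extract pairs
// from the previous dump have been folded into single vector.shape_cast ops (e.g.
// %91/%94 below), and the RHS values appear as plain 1x4 row reads rather than
// transposed 4x1 columns; the computation itself is unchanged.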
func @_large_aligned_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c15 = constant 15 : index
%c14 = constant 14 : index
%c13 = constant 13 : index
%c12 = constant 12 : index
%c11 = constant 11 : index
%c10 = constant 10 : index
%c9 = constant 9 : index
%c7 = constant 7 : index
%c6 = constant 6 : index
%c5 = constant 5 : index
%c4 = constant 4 : index
%c3 = constant 3 : index
%c2 = constant 2 : index
%c1 = constant 1 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst_0 = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1024 = constant 1024 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
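// %3 and %4 are the workgroup-memory staging buffers (the 8x128 RHS tile and the
// 64x8 LHS tile, address space 3); %5, %6 and %7 bind the 2048x1024 LHS, 1024x512
// RHS and 2048x512 output buffers of the GEMM.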
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %8 to %c2048 step %9 {
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %10 to %c512 step %11 {
%12 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%14 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%15 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%17 = memref.subview %14[%15, %16] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
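// Tiling scheme: each workgroup owns a 64x128 output tile (%12, %13, %14 select the
// matching LHS rows, RHS columns and output block), and within it each thread owns
// the 16x4 sub-tile %17 at row %1 * 16, column %0 * 4.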
vector.transfer_write %cst, %17[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c4, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c5, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c6, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c7, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c8, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c9, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c10, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c11, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c12, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c13, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c14, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c15, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
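// The sixteen writes above zero-initialize the thread's 16x4 accumulator tile in the
// output buffer; the K loop below steps the 1024-deep reduction 8 elements at a time,
// reading the tile back and updating it on every iteration.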
scf.for %arg2 = %c0 to %c1024 step %c8 {
%18 = memref.subview %12[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%19 = memref.subview %13[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%20 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%21 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%23 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%24 = vector.transfer_read %18[%22, %23], %cst_0 {in_bounds = [true, true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %24, %20[%22, %23] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%27 = vector.transfer_read %19[%25, %26], %cst_0 {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%28 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%29 = vector.transfer_read %19[%28, %26], %cst_0 {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %27, %21[%25, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %29, %21[%28, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
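// Between the two barriers, the threads of the workgroup cooperatively copy the
// current 64x8 LHS tile (%18 -> %20) and 8x128 RHS tile (%19 -> %21, two 1x4 rows
// per thread) into workgroup memory; the affine maps above linearize the 3-D thread
// id into the row/column offsets each thread is responsible for.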
%30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%31 = memref.subview %20[%15, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%32 = memref.subview %21[0, %30] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
%33 = memref.subview %14[%15, %30] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%34 = vector.transfer_read %31[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%35 = vector.transfer_read %31[%c0, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%36 = vector.transfer_read %31[%c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%37 = vector.transfer_read %31[%c1, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%38 = vector.transfer_read %31[%c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%39 = vector.transfer_read %31[%c2, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%40 = vector.transfer_read %31[%c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%41 = vector.transfer_read %31[%c3, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%42 = vector.transfer_read %31[%c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%43 = vector.transfer_read %31[%c4, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%44 = vector.transfer_read %31[%c5, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%45 = vector.transfer_read %31[%c5, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%46 = vector.transfer_read %31[%c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%47 = vector.transfer_read %31[%c6, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%48 = vector.transfer_read %31[%c7, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%49 = vector.transfer_read %31[%c7, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%50 = vector.transfer_read %31[%c8, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%51 = vector.transfer_read %31[%c8, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%52 = vector.transfer_read %31[%c9, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%53 = vector.transfer_read %31[%c9, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%54 = vector.transfer_read %31[%c10, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%55 = vector.transfer_read %31[%c10, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%56 = vector.transfer_read %31[%c11, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%57 = vector.transfer_read %31[%c11, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%58 = vector.transfer_read %31[%c12, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%59 = vector.transfer_read %31[%c12, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%60 = vector.transfer_read %31[%c13, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%61 = vector.transfer_read %31[%c13, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%62 = vector.transfer_read %31[%c14, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%63 = vector.transfer_read %31[%c14, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%64 = vector.transfer_read %31[%c15, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%65 = vector.transfer_read %31[%c15, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%66 = vector.transfer_read %32[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%67 = vector.transfer_read %32[%c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%68 = vector.transfer_read %32[%c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%69 = vector.transfer_read %32[%c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%70 = vector.transfer_read %32[%c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%71 = vector.transfer_read %32[%c5, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%72 = vector.transfer_read %32[%c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%73 = vector.transfer_read %32[%c7, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%74 = vector.transfer_read %33[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%75 = vector.transfer_read %33[%c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%76 = vector.transfer_read %33[%c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%77 = vector.transfer_read %33[%c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%78 = vector.transfer_read %33[%c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%79 = vector.transfer_read %33[%c5, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%80 = vector.transfer_read %33[%c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%81 = vector.transfer_read %33[%c7, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%82 = vector.transfer_read %33[%c8, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%83 = vector.transfer_read %33[%c9, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%84 = vector.transfer_read %33[%c10, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%85 = vector.transfer_read %33[%c11, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%86 = vector.transfer_read %33[%c12, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%87 = vector.transfer_read %33[%c13, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%88 = vector.transfer_read %33[%c14, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%89 = vector.transfer_read %33[%c15, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
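// Per-thread register loads for this K step: the sixteen LHS rows as two 1x4 halves
// each (%34..%65), the eight 1x4 RHS rows of the thread's column block (%66..%73),
// and the sixteen 1x4 accumulator rows read back from the output subview (%74..%89).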
%90 = vector.extract_strided_slice %34 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%91 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%92 = vector.extract %90[0, 0] : vector<1x1xf32>
%93 = splat %92 : vector<4xf32>
%94 = vector.shape_cast %74 : vector<1x4xf32> to vector<4xf32>
%95 = vector.fma %93, %91, %94 : vector<4xf32>
%96 = vector.extract_strided_slice %34 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%97 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%98 = vector.extract %96[0, 0] : vector<1x1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.fma %99, %97, %95 : vector<4xf32>
%101 = vector.extract_strided_slice %34 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%102 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%103 = vector.extract %101[0, 0] : vector<1x1xf32>
%104 = splat %103 : vector<4xf32>
%105 = vector.fma %104, %102, %100 : vector<4xf32>
%106 = vector.extract_strided_slice %34 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%107 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%108 = vector.extract %106[0, 0] : vector<1x1xf32>
%109 = splat %108 : vector<4xf32>
%110 = vector.fma %109, %107, %105 : vector<4xf32>
%111 = vector.extract_strided_slice %35 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%112 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%113 = vector.extract %111[0, 0] : vector<1x1xf32>
%114 = splat %113 : vector<4xf32>
%115 = vector.fma %114, %112, %110 : vector<4xf32>
%116 = vector.extract_strided_slice %35 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%117 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%118 = vector.extract %116[0, 0] : vector<1x1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %117, %115 : vector<4xf32>
%121 = vector.extract_strided_slice %35 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%122 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%123 = vector.extract %121[0, 0] : vector<1x1xf32>
%124 = splat %123 : vector<4xf32>
%125 = vector.fma %124, %122, %120 : vector<4xf32>
%126 = vector.extract_strided_slice %35 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%127 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%128 = vector.extract %126[0, 0] : vector<1x1xf32>
%129 = splat %128 : vector<4xf32>
%130 = vector.fma %129, %127, %125 : vector<4xf32>
%131 = vector.shape_cast %130 : vector<4xf32> to vector<1x4xf32>
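// First accumulator row done: %95..%130 apply the same eight splat + vector.fma
// rank-1 updates as in the previous dump, now with vector.shape_cast in place of the
// transpose/extract pairs; the remaining fifteen rows below repeat this pattern with
// LHS halves %36..%65 against accumulators %75..%89.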
%132 = vector.extract_strided_slice %36 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%133 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%134 = vector.extract %132[0, 0] : vector<1x1xf32>
%135 = splat %134 : vector<4xf32>
%136 = vector.shape_cast %75 : vector<1x4xf32> to vector<4xf32>
%137 = vector.fma %135, %133, %136 : vector<4xf32>
%138 = vector.extract_strided_slice %36 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%139 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%140 = vector.extract %138[0, 0] : vector<1x1xf32>
%141 = splat %140 : vector<4xf32>
%142 = vector.fma %141, %139, %137 : vector<4xf32>
%143 = vector.extract_strided_slice %36 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%144 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%145 = vector.extract %143[0, 0] : vector<1x1xf32>
%146 = splat %145 : vector<4xf32>
%147 = vector.fma %146, %144, %142 : vector<4xf32>
%148 = vector.extract_strided_slice %36 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%149 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%150 = vector.extract %148[0, 0] : vector<1x1xf32>
%151 = splat %150 : vector<4xf32>
%152 = vector.fma %151, %149, %147 : vector<4xf32>
%153 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%154 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%155 = vector.extract %153[0, 0] : vector<1x1xf32>
%156 = splat %155 : vector<4xf32>
%157 = vector.fma %156, %154, %152 : vector<4xf32>
%158 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%159 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%160 = vector.extract %158[0, 0] : vector<1x1xf32>
%161 = splat %160 : vector<4xf32>
%162 = vector.fma %161, %159, %157 : vector<4xf32>
%163 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%164 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%165 = vector.extract %163[0, 0] : vector<1x1xf32>
%166 = splat %165 : vector<4xf32>
%167 = vector.fma %166, %164, %162 : vector<4xf32>
%168 = vector.extract_strided_slice %37 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%169 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%170 = vector.extract %168[0, 0] : vector<1x1xf32>
%171 = splat %170 : vector<4xf32>
%172 = vector.fma %171, %169, %167 : vector<4xf32>
%173 = vector.shape_cast %172 : vector<4xf32> to vector<1x4xf32>
%174 = vector.extract_strided_slice %38 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%175 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%176 = vector.extract %174[0, 0] : vector<1x1xf32>
%177 = splat %176 : vector<4xf32>
%178 = vector.shape_cast %76 : vector<1x4xf32> to vector<4xf32>
%179 = vector.fma %177, %175, %178 : vector<4xf32>
%180 = vector.extract_strided_slice %38 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%181 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%182 = vector.extract %180[0, 0] : vector<1x1xf32>
%183 = splat %182 : vector<4xf32>
%184 = vector.fma %183, %181, %179 : vector<4xf32>
%185 = vector.extract_strided_slice %38 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%186 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%187 = vector.extract %185[0, 0] : vector<1x1xf32>
%188 = splat %187 : vector<4xf32>
%189 = vector.fma %188, %186, %184 : vector<4xf32>
%190 = vector.extract_strided_slice %38 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%191 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%192 = vector.extract %190[0, 0] : vector<1x1xf32>
%193 = splat %192 : vector<4xf32>
%194 = vector.fma %193, %191, %189 : vector<4xf32>
%195 = vector.extract_strided_slice %39 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%196 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%197 = vector.extract %195[0, 0] : vector<1x1xf32>
%198 = splat %197 : vector<4xf32>
%199 = vector.fma %198, %196, %194 : vector<4xf32>
%200 = vector.extract_strided_slice %39 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%201 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%202 = vector.extract %200[0, 0] : vector<1x1xf32>
%203 = splat %202 : vector<4xf32>
%204 = vector.fma %203, %201, %199 : vector<4xf32>
%205 = vector.extract_strided_slice %39 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%206 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%207 = vector.extract %205[0, 0] : vector<1x1xf32>
%208 = splat %207 : vector<4xf32>
%209 = vector.fma %208, %206, %204 : vector<4xf32>
%210 = vector.extract_strided_slice %39 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%211 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%212 = vector.extract %210[0, 0] : vector<1x1xf32>
%213 = splat %212 : vector<4xf32>
%214 = vector.fma %213, %211, %209 : vector<4xf32>
%215 = vector.shape_cast %214 : vector<4xf32> to vector<1x4xf32>
%216 = vector.extract_strided_slice %40 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%217 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%218 = vector.extract %216[0, 0] : vector<1x1xf32>
%219 = splat %218 : vector<4xf32>
%220 = vector.shape_cast %77 : vector<1x4xf32> to vector<4xf32>
%221 = vector.fma %219, %217, %220 : vector<4xf32>
%222 = vector.extract_strided_slice %40 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%223 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%224 = vector.extract %222[0, 0] : vector<1x1xf32>
%225 = splat %224 : vector<4xf32>
%226 = vector.fma %225, %223, %221 : vector<4xf32>
%227 = vector.extract_strided_slice %40 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%228 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%229 = vector.extract %227[0, 0] : vector<1x1xf32>
%230 = splat %229 : vector<4xf32>
%231 = vector.fma %230, %228, %226 : vector<4xf32>
%232 = vector.extract_strided_slice %40 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%233 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%234 = vector.extract %232[0, 0] : vector<1x1xf32>
%235 = splat %234 : vector<4xf32>
%236 = vector.fma %235, %233, %231 : vector<4xf32>
%237 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%238 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%239 = vector.extract %237[0, 0] : vector<1x1xf32>
%240 = splat %239 : vector<4xf32>
%241 = vector.fma %240, %238, %236 : vector<4xf32>
%242 = vector.extract_strided_slice %41 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%243 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%244 = vector.extract %242[0, 0] : vector<1x1xf32>
%245 = splat %244 : vector<4xf32>
%246 = vector.fma %245, %243, %241 : vector<4xf32>
%247 = vector.extract_strided_slice %41 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%248 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%249 = vector.extract %247[0, 0] : vector<1x1xf32>
%250 = splat %249 : vector<4xf32>
%251 = vector.fma %250, %248, %246 : vector<4xf32>
%252 = vector.extract_strided_slice %41 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%253 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%254 = vector.extract %252[0, 0] : vector<1x1xf32>
%255 = splat %254 : vector<4xf32>
%256 = vector.fma %255, %253, %251 : vector<4xf32>
%257 = vector.shape_cast %256 : vector<4xf32> to vector<1x4xf32>
%258 = vector.extract_strided_slice %42 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%259 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%260 = vector.extract %258[0, 0] : vector<1x1xf32>
%261 = splat %260 : vector<4xf32>
%262 = vector.shape_cast %78 : vector<1x4xf32> to vector<4xf32>
%263 = vector.fma %261, %259, %262 : vector<4xf32>
%264 = vector.extract_strided_slice %42 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%265 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%266 = vector.extract %264[0, 0] : vector<1x1xf32>
%267 = splat %266 : vector<4xf32>
%268 = vector.fma %267, %265, %263 : vector<4xf32>
%269 = vector.extract_strided_slice %42 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%270 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%271 = vector.extract %269[0, 0] : vector<1x1xf32>
%272 = splat %271 : vector<4xf32>
%273 = vector.fma %272, %270, %268 : vector<4xf32>
%274 = vector.extract_strided_slice %42 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%275 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%276 = vector.extract %274[0, 0] : vector<1x1xf32>
%277 = splat %276 : vector<4xf32>
%278 = vector.fma %277, %275, %273 : vector<4xf32>
%279 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%280 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%281 = vector.extract %279[0, 0] : vector<1x1xf32>
%282 = splat %281 : vector<4xf32>
%283 = vector.fma %282, %280, %278 : vector<4xf32>
%284 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%285 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%286 = vector.extract %284[0, 0] : vector<1x1xf32>
%287 = splat %286 : vector<4xf32>
%288 = vector.fma %287, %285, %283 : vector<4xf32>
%289 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%290 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%291 = vector.extract %289[0, 0] : vector<1x1xf32>
%292 = splat %291 : vector<4xf32>
%293 = vector.fma %292, %290, %288 : vector<4xf32>
%294 = vector.extract_strided_slice %43 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%295 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%296 = vector.extract %294[0, 0] : vector<1x1xf32>
%297 = splat %296 : vector<4xf32>
%298 = vector.fma %297, %295, %293 : vector<4xf32>
%299 = vector.shape_cast %298 : vector<4xf32> to vector<1x4xf32>
%300 = vector.extract_strided_slice %44 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%301 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%302 = vector.extract %300[0, 0] : vector<1x1xf32>
%303 = splat %302 : vector<4xf32>
%304 = vector.shape_cast %79 : vector<1x4xf32> to vector<4xf32>
%305 = vector.fma %303, %301, %304 : vector<4xf32>
%306 = vector.extract_strided_slice %44 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%307 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%308 = vector.extract %306[0, 0] : vector<1x1xf32>
%309 = splat %308 : vector<4xf32>
%310 = vector.fma %309, %307, %305 : vector<4xf32>
%311 = vector.extract_strided_slice %44 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%312 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%313 = vector.extract %311[0, 0] : vector<1x1xf32>
%314 = splat %313 : vector<4xf32>
%315 = vector.fma %314, %312, %310 : vector<4xf32>
%316 = vector.extract_strided_slice %44 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%317 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%318 = vector.extract %316[0, 0] : vector<1x1xf32>
%319 = splat %318 : vector<4xf32>
%320 = vector.fma %319, %317, %315 : vector<4xf32>
%321 = vector.extract_strided_slice %45 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%322 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%323 = vector.extract %321[0, 0] : vector<1x1xf32>
%324 = splat %323 : vector<4xf32>
%325 = vector.fma %324, %322, %320 : vector<4xf32>
%326 = vector.extract_strided_slice %45 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%327 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%328 = vector.extract %326[0, 0] : vector<1x1xf32>
%329 = splat %328 : vector<4xf32>
%330 = vector.fma %329, %327, %325 : vector<4xf32>
%331 = vector.extract_strided_slice %45 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%332 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%333 = vector.extract %331[0, 0] : vector<1x1xf32>
%334 = splat %333 : vector<4xf32>
%335 = vector.fma %334, %332, %330 : vector<4xf32>
%336 = vector.extract_strided_slice %45 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%337 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%338 = vector.extract %336[0, 0] : vector<1x1xf32>
%339 = splat %338 : vector<4xf32>
%340 = vector.fma %339, %337, %335 : vector<4xf32>
%341 = vector.shape_cast %340 : vector<4xf32> to vector<1x4xf32>
%342 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%343 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%344 = vector.extract %342[0, 0] : vector<1x1xf32>
%345 = splat %344 : vector<4xf32>
%346 = vector.shape_cast %80 : vector<1x4xf32> to vector<4xf32>
%347 = vector.fma %345, %343, %346 : vector<4xf32>
%348 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%349 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%350 = vector.extract %348[0, 0] : vector<1x1xf32>
%351 = splat %350 : vector<4xf32>
%352 = vector.fma %351, %349, %347 : vector<4xf32>
%353 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%354 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%355 = vector.extract %353[0, 0] : vector<1x1xf32>
%356 = splat %355 : vector<4xf32>
%357 = vector.fma %356, %354, %352 : vector<4xf32>
%358 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%359 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%360 = vector.extract %358[0, 0] : vector<1x1xf32>
%361 = splat %360 : vector<4xf32>
%362 = vector.fma %361, %359, %357 : vector<4xf32>
%363 = vector.extract_strided_slice %47 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%364 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%365 = vector.extract %363[0, 0] : vector<1x1xf32>
%366 = splat %365 : vector<4xf32>
%367 = vector.fma %366, %364, %362 : vector<4xf32>
%368 = vector.extract_strided_slice %47 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%369 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%370 = vector.extract %368[0, 0] : vector<1x1xf32>
%371 = splat %370 : vector<4xf32>
%372 = vector.fma %371, %369, %367 : vector<4xf32>
%373 = vector.extract_strided_slice %47 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%374 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%375 = vector.extract %373[0, 0] : vector<1x1xf32>
%376 = splat %375 : vector<4xf32>
%377 = vector.fma %376, %374, %372 : vector<4xf32>
%378 = vector.extract_strided_slice %47 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%379 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%380 = vector.extract %378[0, 0] : vector<1x1xf32>
%381 = splat %380 : vector<4xf32>
%382 = vector.fma %381, %379, %377 : vector<4xf32>
%383 = vector.shape_cast %382 : vector<4xf32> to vector<1x4xf32>
%384 = vector.extract_strided_slice %48 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%385 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%386 = vector.extract %384[0, 0] : vector<1x1xf32>
%387 = splat %386 : vector<4xf32>
%388 = vector.shape_cast %81 : vector<1x4xf32> to vector<4xf32>
%389 = vector.fma %387, %385, %388 : vector<4xf32>
%390 = vector.extract_strided_slice %48 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%391 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%392 = vector.extract %390[0, 0] : vector<1x1xf32>
%393 = splat %392 : vector<4xf32>
%394 = vector.fma %393, %391, %389 : vector<4xf32>
%395 = vector.extract_strided_slice %48 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%396 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%397 = vector.extract %395[0, 0] : vector<1x1xf32>
%398 = splat %397 : vector<4xf32>
%399 = vector.fma %398, %396, %394 : vector<4xf32>
%400 = vector.extract_strided_slice %48 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%401 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%402 = vector.extract %400[0, 0] : vector<1x1xf32>
%403 = splat %402 : vector<4xf32>
%404 = vector.fma %403, %401, %399 : vector<4xf32>
%405 = vector.extract_strided_slice %49 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%406 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%407 = vector.extract %405[0, 0] : vector<1x1xf32>
%408 = splat %407 : vector<4xf32>
%409 = vector.fma %408, %406, %404 : vector<4xf32>
%410 = vector.extract_strided_slice %49 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%411 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%412 = vector.extract %410[0, 0] : vector<1x1xf32>
%413 = splat %412 : vector<4xf32>
%414 = vector.fma %413, %411, %409 : vector<4xf32>
%415 = vector.extract_strided_slice %49 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%416 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%417 = vector.extract %415[0, 0] : vector<1x1xf32>
%418 = splat %417 : vector<4xf32>
%419 = vector.fma %418, %416, %414 : vector<4xf32>
%420 = vector.extract_strided_slice %49 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%421 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%422 = vector.extract %420[0, 0] : vector<1x1xf32>
%423 = splat %422 : vector<4xf32>
%424 = vector.fma %423, %421, %419 : vector<4xf32>
%425 = vector.shape_cast %424 : vector<4xf32> to vector<1x4xf32>
%426 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%427 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%428 = vector.extract %426[0, 0] : vector<1x1xf32>
%429 = splat %428 : vector<4xf32>
%430 = vector.shape_cast %82 : vector<1x4xf32> to vector<4xf32>
%431 = vector.fma %429, %427, %430 : vector<4xf32>
%432 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%433 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%434 = vector.extract %432[0, 0] : vector<1x1xf32>
%435 = splat %434 : vector<4xf32>
%436 = vector.fma %435, %433, %431 : vector<4xf32>
%437 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%438 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%439 = vector.extract %437[0, 0] : vector<1x1xf32>
%440 = splat %439 : vector<4xf32>
%441 = vector.fma %440, %438, %436 : vector<4xf32>
%442 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%443 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%444 = vector.extract %442[0, 0] : vector<1x1xf32>
%445 = splat %444 : vector<4xf32>
%446 = vector.fma %445, %443, %441 : vector<4xf32>
%447 = vector.extract_strided_slice %51 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%448 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%449 = vector.extract %447[0, 0] : vector<1x1xf32>
%450 = splat %449 : vector<4xf32>
%451 = vector.fma %450, %448, %446 : vector<4xf32>
%452 = vector.extract_strided_slice %51 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%453 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%454 = vector.extract %452[0, 0] : vector<1x1xf32>
%455 = splat %454 : vector<4xf32>
%456 = vector.fma %455, %453, %451 : vector<4xf32>
%457 = vector.extract_strided_slice %51 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%458 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%459 = vector.extract %457[0, 0] : vector<1x1xf32>
%460 = splat %459 : vector<4xf32>
%461 = vector.fma %460, %458, %456 : vector<4xf32>
%462 = vector.extract_strided_slice %51 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%463 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%464 = vector.extract %462[0, 0] : vector<1x1xf32>
%465 = splat %464 : vector<4xf32>
%466 = vector.fma %465, %463, %461 : vector<4xf32>
%467 = vector.shape_cast %466 : vector<4xf32> to vector<1x4xf32>
%468 = vector.extract_strided_slice %52 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%469 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%470 = vector.extract %468[0, 0] : vector<1x1xf32>
%471 = splat %470 : vector<4xf32>
%472 = vector.shape_cast %83 : vector<1x4xf32> to vector<4xf32>
%473 = vector.fma %471, %469, %472 : vector<4xf32>
%474 = vector.extract_strided_slice %52 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%475 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%476 = vector.extract %474[0, 0] : vector<1x1xf32>
%477 = splat %476 : vector<4xf32>
%478 = vector.fma %477, %475, %473 : vector<4xf32>
%479 = vector.extract_strided_slice %52 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%480 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%481 = vector.extract %479[0, 0] : vector<1x1xf32>
%482 = splat %481 : vector<4xf32>
%483 = vector.fma %482, %480, %478 : vector<4xf32>
%484 = vector.extract_strided_slice %52 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%485 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%486 = vector.extract %484[0, 0] : vector<1x1xf32>
%487 = splat %486 : vector<4xf32>
%488 = vector.fma %487, %485, %483 : vector<4xf32>
%489 = vector.extract_strided_slice %53 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%490 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%491 = vector.extract %489[0, 0] : vector<1x1xf32>
%492 = splat %491 : vector<4xf32>
%493 = vector.fma %492, %490, %488 : vector<4xf32>
%494 = vector.extract_strided_slice %53 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%495 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%496 = vector.extract %494[0, 0] : vector<1x1xf32>
%497 = splat %496 : vector<4xf32>
%498 = vector.fma %497, %495, %493 : vector<4xf32>
%499 = vector.extract_strided_slice %53 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%500 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%501 = vector.extract %499[0, 0] : vector<1x1xf32>
%502 = splat %501 : vector<4xf32>
%503 = vector.fma %502, %500, %498 : vector<4xf32>
%504 = vector.extract_strided_slice %53 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%505 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%506 = vector.extract %504[0, 0] : vector<1x1xf32>
%507 = splat %506 : vector<4xf32>
%508 = vector.fma %507, %505, %503 : vector<4xf32>
%509 = vector.shape_cast %508 : vector<4xf32> to vector<1x4xf32>
%510 = vector.extract_strided_slice %54 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%511 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%512 = vector.extract %510[0, 0] : vector<1x1xf32>
%513 = splat %512 : vector<4xf32>
%514 = vector.shape_cast %84 : vector<1x4xf32> to vector<4xf32>
%515 = vector.fma %513, %511, %514 : vector<4xf32>
%516 = vector.extract_strided_slice %54 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%517 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%518 = vector.extract %516[0, 0] : vector<1x1xf32>
%519 = splat %518 : vector<4xf32>
%520 = vector.fma %519, %517, %515 : vector<4xf32>
%521 = vector.extract_strided_slice %54 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%522 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%523 = vector.extract %521[0, 0] : vector<1x1xf32>
%524 = splat %523 : vector<4xf32>
%525 = vector.fma %524, %522, %520 : vector<4xf32>
%526 = vector.extract_strided_slice %54 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%527 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%528 = vector.extract %526[0, 0] : vector<1x1xf32>
%529 = splat %528 : vector<4xf32>
%530 = vector.fma %529, %527, %525 : vector<4xf32>
%531 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%532 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%533 = vector.extract %531[0, 0] : vector<1x1xf32>
%534 = splat %533 : vector<4xf32>
%535 = vector.fma %534, %532, %530 : vector<4xf32>
%536 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%537 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%538 = vector.extract %536[0, 0] : vector<1x1xf32>
%539 = splat %538 : vector<4xf32>
%540 = vector.fma %539, %537, %535 : vector<4xf32>
%541 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%542 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%543 = vector.extract %541[0, 0] : vector<1x1xf32>
%544 = splat %543 : vector<4xf32>
%545 = vector.fma %544, %542, %540 : vector<4xf32>
%546 = vector.extract_strided_slice %55 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%547 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%548 = vector.extract %546[0, 0] : vector<1x1xf32>
%549 = splat %548 : vector<4xf32>
%550 = vector.fma %549, %547, %545 : vector<4xf32>
%551 = vector.shape_cast %550 : vector<4xf32> to vector<1x4xf32>
%552 = vector.extract_strided_slice %56 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%553 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%554 = vector.extract %552[0, 0] : vector<1x1xf32>
%555 = splat %554 : vector<4xf32>
%556 = vector.shape_cast %85 : vector<1x4xf32> to vector<4xf32>
%557 = vector.fma %555, %553, %556 : vector<4xf32>
%558 = vector.extract_strided_slice %56 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%559 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%560 = vector.extract %558[0, 0] : vector<1x1xf32>
%561 = splat %560 : vector<4xf32>
%562 = vector.fma %561, %559, %557 : vector<4xf32>
%563 = vector.extract_strided_slice %56 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%564 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%565 = vector.extract %563[0, 0] : vector<1x1xf32>
%566 = splat %565 : vector<4xf32>
%567 = vector.fma %566, %564, %562 : vector<4xf32>
%568 = vector.extract_strided_slice %56 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%569 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%570 = vector.extract %568[0, 0] : vector<1x1xf32>
%571 = splat %570 : vector<4xf32>
%572 = vector.fma %571, %569, %567 : vector<4xf32>
%573 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%574 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%575 = vector.extract %573[0, 0] : vector<1x1xf32>
%576 = splat %575 : vector<4xf32>
%577 = vector.fma %576, %574, %572 : vector<4xf32>
%578 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%579 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%580 = vector.extract %578[0, 0] : vector<1x1xf32>
%581 = splat %580 : vector<4xf32>
%582 = vector.fma %581, %579, %577 : vector<4xf32>
%583 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%584 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%585 = vector.extract %583[0, 0] : vector<1x1xf32>
%586 = splat %585 : vector<4xf32>
%587 = vector.fma %586, %584, %582 : vector<4xf32>
%588 = vector.extract_strided_slice %57 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%589 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%590 = vector.extract %588[0, 0] : vector<1x1xf32>
%591 = splat %590 : vector<4xf32>
%592 = vector.fma %591, %589, %587 : vector<4xf32>
%593 = vector.shape_cast %592 : vector<4xf32> to vector<1x4xf32>
%594 = vector.extract_strided_slice %58 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%595 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%596 = vector.extract %594[0, 0] : vector<1x1xf32>
%597 = splat %596 : vector<4xf32>
%598 = vector.shape_cast %86 : vector<1x4xf32> to vector<4xf32>
%599 = vector.fma %597, %595, %598 : vector<4xf32>
%600 = vector.extract_strided_slice %58 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%601 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%602 = vector.extract %600[0, 0] : vector<1x1xf32>
%603 = splat %602 : vector<4xf32>
%604 = vector.fma %603, %601, %599 : vector<4xf32>
%605 = vector.extract_strided_slice %58 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%606 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%607 = vector.extract %605[0, 0] : vector<1x1xf32>
%608 = splat %607 : vector<4xf32>
%609 = vector.fma %608, %606, %604 : vector<4xf32>
%610 = vector.extract_strided_slice %58 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%611 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%612 = vector.extract %610[0, 0] : vector<1x1xf32>
%613 = splat %612 : vector<4xf32>
%614 = vector.fma %613, %611, %609 : vector<4xf32>
%615 = vector.extract_strided_slice %59 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%616 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%617 = vector.extract %615[0, 0] : vector<1x1xf32>
%618 = splat %617 : vector<4xf32>
%619 = vector.fma %618, %616, %614 : vector<4xf32>
%620 = vector.extract_strided_slice %59 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%621 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%622 = vector.extract %620[0, 0] : vector<1x1xf32>
%623 = splat %622 : vector<4xf32>
%624 = vector.fma %623, %621, %619 : vector<4xf32>
%625 = vector.extract_strided_slice %59 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%626 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%627 = vector.extract %625[0, 0] : vector<1x1xf32>
%628 = splat %627 : vector<4xf32>
%629 = vector.fma %628, %626, %624 : vector<4xf32>
%630 = vector.extract_strided_slice %59 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%631 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%632 = vector.extract %630[0, 0] : vector<1x1xf32>
%633 = splat %632 : vector<4xf32>
%634 = vector.fma %633, %631, %629 : vector<4xf32>
%635 = vector.shape_cast %634 : vector<4xf32> to vector<1x4xf32>
%636 = vector.extract_strided_slice %60 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%637 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%638 = vector.extract %636[0, 0] : vector<1x1xf32>
%639 = splat %638 : vector<4xf32>
%640 = vector.shape_cast %87 : vector<1x4xf32> to vector<4xf32>
%641 = vector.fma %639, %637, %640 : vector<4xf32>
%642 = vector.extract_strided_slice %60 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%643 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%644 = vector.extract %642[0, 0] : vector<1x1xf32>
%645 = splat %644 : vector<4xf32>
%646 = vector.fma %645, %643, %641 : vector<4xf32>
%647 = vector.extract_strided_slice %60 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%648 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%649 = vector.extract %647[0, 0] : vector<1x1xf32>
%650 = splat %649 : vector<4xf32>
%651 = vector.fma %650, %648, %646 : vector<4xf32>
%652 = vector.extract_strided_slice %60 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%653 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%654 = vector.extract %652[0, 0] : vector<1x1xf32>
%655 = splat %654 : vector<4xf32>
%656 = vector.fma %655, %653, %651 : vector<4xf32>
%657 = vector.extract_strided_slice %61 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%658 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%659 = vector.extract %657[0, 0] : vector<1x1xf32>
%660 = splat %659 : vector<4xf32>
%661 = vector.fma %660, %658, %656 : vector<4xf32>
%662 = vector.extract_strided_slice %61 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%663 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%664 = vector.extract %662[0, 0] : vector<1x1xf32>
%665 = splat %664 : vector<4xf32>
%666 = vector.fma %665, %663, %661 : vector<4xf32>
%667 = vector.extract_strided_slice %61 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%668 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%669 = vector.extract %667[0, 0] : vector<1x1xf32>
%670 = splat %669 : vector<4xf32>
%671 = vector.fma %670, %668, %666 : vector<4xf32>
%672 = vector.extract_strided_slice %61 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%673 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%674 = vector.extract %672[0, 0] : vector<1x1xf32>
%675 = splat %674 : vector<4xf32>
%676 = vector.fma %675, %673, %671 : vector<4xf32>
%677 = vector.shape_cast %676 : vector<4xf32> to vector<1x4xf32>
%678 = vector.extract_strided_slice %62 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%679 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%680 = vector.extract %678[0, 0] : vector<1x1xf32>
%681 = splat %680 : vector<4xf32>
%682 = vector.shape_cast %88 : vector<1x4xf32> to vector<4xf32>
%683 = vector.fma %681, %679, %682 : vector<4xf32>
%684 = vector.extract_strided_slice %62 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%685 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%686 = vector.extract %684[0, 0] : vector<1x1xf32>
%687 = splat %686 : vector<4xf32>
%688 = vector.fma %687, %685, %683 : vector<4xf32>
%689 = vector.extract_strided_slice %62 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%690 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%691 = vector.extract %689[0, 0] : vector<1x1xf32>
%692 = splat %691 : vector<4xf32>
%693 = vector.fma %692, %690, %688 : vector<4xf32>
%694 = vector.extract_strided_slice %62 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%695 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%696 = vector.extract %694[0, 0] : vector<1x1xf32>
%697 = splat %696 : vector<4xf32>
%698 = vector.fma %697, %695, %693 : vector<4xf32>
%699 = vector.extract_strided_slice %63 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%700 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%701 = vector.extract %699[0, 0] : vector<1x1xf32>
%702 = splat %701 : vector<4xf32>
%703 = vector.fma %702, %700, %698 : vector<4xf32>
%704 = vector.extract_strided_slice %63 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%705 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%706 = vector.extract %704[0, 0] : vector<1x1xf32>
%707 = splat %706 : vector<4xf32>
%708 = vector.fma %707, %705, %703 : vector<4xf32>
%709 = vector.extract_strided_slice %63 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%710 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%711 = vector.extract %709[0, 0] : vector<1x1xf32>
%712 = splat %711 : vector<4xf32>
%713 = vector.fma %712, %710, %708 : vector<4xf32>
%714 = vector.extract_strided_slice %63 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%715 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%716 = vector.extract %714[0, 0] : vector<1x1xf32>
%717 = splat %716 : vector<4xf32>
%718 = vector.fma %717, %715, %713 : vector<4xf32>
%719 = vector.shape_cast %718 : vector<4xf32> to vector<1x4xf32>
%720 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%721 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%722 = vector.extract %720[0, 0] : vector<1x1xf32>
%723 = splat %722 : vector<4xf32>
%724 = vector.shape_cast %89 : vector<1x4xf32> to vector<4xf32>
%725 = vector.fma %723, %721, %724 : vector<4xf32>
%726 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%727 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%728 = vector.extract %726[0, 0] : vector<1x1xf32>
%729 = splat %728 : vector<4xf32>
%730 = vector.fma %729, %727, %725 : vector<4xf32>
%731 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%732 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%733 = vector.extract %731[0, 0] : vector<1x1xf32>
%734 = splat %733 : vector<4xf32>
%735 = vector.fma %734, %732, %730 : vector<4xf32>
%736 = vector.extract_strided_slice %64 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%737 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%738 = vector.extract %736[0, 0] : vector<1x1xf32>
%739 = splat %738 : vector<4xf32>
%740 = vector.fma %739, %737, %735 : vector<4xf32>
%741 = vector.extract_strided_slice %65 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%742 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%743 = vector.extract %741[0, 0] : vector<1x1xf32>
%744 = splat %743 : vector<4xf32>
%745 = vector.fma %744, %742, %740 : vector<4xf32>
%746 = vector.extract_strided_slice %65 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%747 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%748 = vector.extract %746[0, 0] : vector<1x1xf32>
%749 = splat %748 : vector<4xf32>
%750 = vector.fma %749, %747, %745 : vector<4xf32>
%751 = vector.extract_strided_slice %65 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%752 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%753 = vector.extract %751[0, 0] : vector<1x1xf32>
%754 = splat %753 : vector<4xf32>
%755 = vector.fma %754, %752, %750 : vector<4xf32>
%756 = vector.extract_strided_slice %65 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%757 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%758 = vector.extract %756[0, 0] : vector<1x1xf32>
%759 = splat %758 : vector<4xf32>
%760 = vector.fma %759, %757, %755 : vector<4xf32>
%761 = vector.shape_cast %760 : vector<4xf32> to vector<1x4xf32>
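        // The sixteen vector<1x4xf32> results above (%131 ... %761) are the accumulator rows updated by this k-step; the stores below write them back to this thread's 16x4 output tile (%33).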
vector.transfer_write %131, %33[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %173, %33[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %215, %33[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %257, %33[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %299, %33[%c4, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %341, %33[%c5, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %383, %33[%c6, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %425, %33[%c7, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %467, %33[%c8, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %509, %33[%c9, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %551, %33[%c10, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %593, %33[%c11, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %635, %33[%c12, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %677, %33[%c13, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %719, %33[%c14, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %761, %33[%c15, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
}
return
}
// -----// IR Dump After CSE //----- //
func @_large_aligned_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c15 = constant 15 : index
%c14 = constant 14 : index
%c13 = constant 13 : index
%c12 = constant 12 : index
%c11 = constant 11 : index
%c10 = constant 10 : index
%c9 = constant 9 : index
%c7 = constant 7 : index
%c6 = constant 6 : index
%c5 = constant 5 : index
%c4 = constant 4 : index
%c3 = constant 3 : index
%c2 = constant 2 : index
%c1 = constant 1 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst_0 = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1024 = constant 1024 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
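  // Bindings: %5 holds the 2048x1024 LHS, %6 the 1024x512 RHS, and %7 the 2048x512 result of the matmul.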
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %8 to %c2048 step %9 {
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg1 = %10 to %c512 step %11 {
%12 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%14 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
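      // Each workgroup computes a 64x128 tile of the 2048x512 result (%14), reading the matching 64x1024 rows of the LHS (%12) and 1024x128 columns of the RHS (%13).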
%15 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%17 = memref.subview %14[%15, %16] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
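      // %15 = thread_id.y * 16 and %16 = thread_id.x * 4 place this thread's private 16x4 tile (%17) inside the workgroup's 64x128 tile.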
vector.transfer_write %cst, %17[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c4, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c5, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c6, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c7, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c8, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c9, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c10, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c11, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c12, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c13, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c14, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %cst, %17[%c15, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
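      // The sixteen writes above zero-initialize the 16x4 accumulator tile before the reduction loop over k.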
scf.for %arg2 = %c0 to %c1024 step %c8 {
%18 = memref.subview %12[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%19 = memref.subview %13[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%20 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%21 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%23 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%24 = vector.transfer_read %18[%22, %23], %cst_0 {in_bounds = [true, true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %24, %20[%22, %23] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%27 = vector.transfer_read %19[%25, %26], %cst_0 {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%28 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%29 = vector.transfer_read %19[%28, %26], %cst_0 {in_bounds = [true, true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
vector.transfer_write %27, %21[%25, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %29, %21[%28, %26] {in_bounds = [true, true]} : vector<1x4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
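        // Between the two barriers the workgroup cooperatively stages the current 64x8 LHS tile and 8x128 RHS tile into workgroup memory (%20, %21): each thread copies one 1x4 LHS vector and two 1x4 RHS vectors.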
%30 = memref.subview %20[%15, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%31 = memref.subview %21[0, %16] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
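        // %30 and %31 carve this thread's 16x8 LHS slice and 8x4 RHS slice out of the shared tiles.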
%32 = vector.transfer_read %30[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%33 = vector.transfer_read %30[%c0, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%34 = vector.transfer_read %30[%c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%35 = vector.transfer_read %30[%c1, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%36 = vector.transfer_read %30[%c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%37 = vector.transfer_read %30[%c2, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%38 = vector.transfer_read %30[%c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%39 = vector.transfer_read %30[%c3, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%40 = vector.transfer_read %30[%c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%41 = vector.transfer_read %30[%c4, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%42 = vector.transfer_read %30[%c5, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%43 = vector.transfer_read %30[%c5, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%44 = vector.transfer_read %30[%c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%45 = vector.transfer_read %30[%c6, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%46 = vector.transfer_read %30[%c7, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%47 = vector.transfer_read %30[%c7, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%48 = vector.transfer_read %30[%c8, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%49 = vector.transfer_read %30[%c8, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%50 = vector.transfer_read %30[%c9, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%51 = vector.transfer_read %30[%c9, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%52 = vector.transfer_read %30[%c10, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%53 = vector.transfer_read %30[%c10, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%54 = vector.transfer_read %30[%c11, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%55 = vector.transfer_read %30[%c11, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%56 = vector.transfer_read %30[%c12, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%57 = vector.transfer_read %30[%c12, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%58 = vector.transfer_read %30[%c13, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%59 = vector.transfer_read %30[%c13, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%60 = vector.transfer_read %30[%c14, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%61 = vector.transfer_read %30[%c14, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%62 = vector.transfer_read %30[%c15, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%63 = vector.transfer_read %30[%c15, %c4], %cst_0 {in_bounds = [true, true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<1x4xf32>
%64 = vector.transfer_read %31[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%65 = vector.transfer_read %31[%c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%66 = vector.transfer_read %31[%c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%67 = vector.transfer_read %31[%c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%68 = vector.transfer_read %31[%c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%69 = vector.transfer_read %31[%c5, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%70 = vector.transfer_read %31[%c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%71 = vector.transfer_read %31[%c7, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<1x4xf32>
%72 = vector.transfer_read %17[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%73 = vector.transfer_read %17[%c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%74 = vector.transfer_read %17[%c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%75 = vector.transfer_read %17[%c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%76 = vector.transfer_read %17[%c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%77 = vector.transfer_read %17[%c5, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%78 = vector.transfer_read %17[%c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%79 = vector.transfer_read %17[%c7, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%80 = vector.transfer_read %17[%c8, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%81 = vector.transfer_read %17[%c9, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%82 = vector.transfer_read %17[%c10, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%83 = vector.transfer_read %17[%c11, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%84 = vector.transfer_read %17[%c12, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%85 = vector.transfer_read %17[%c13, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%86 = vector.transfer_read %17[%c14, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
%87 = vector.transfer_read %17[%c15, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<1x4xf32>
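        // The reads above reload the 16x4 accumulator (%72-%87) together with the 16x8 LHS rows (%32-%63) and the 8x4 RHS rows (%64-%71) for this k-step. A minimal scalar sketch of what the fully unrolled FMA chain below computes, where acc/lhs/rhs are illustrative names for those three tiles (not identifiers from this dump):
        //   for (int m = 0; m < 16; ++m)      // accumulator rows
        //     for (int k = 0; k < 8; ++k)     // shared-memory k-slice
        //       for (int n = 0; n < 4; ++n)   // lanes of one vector<4xf32>
        //         acc[m][n] += lhs[m][k] * rhs[k][n];
        // Each (m, k) step broadcasts one LHS scalar (vector.extract + splat) and fuses it against a 4-wide RHS row with vector.fma.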
%88 = vector.extract_strided_slice %32 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%89 = vector.shape_cast %64 : vector<1x4xf32> to vector<4xf32>
%90 = vector.extract %88[0, 0] : vector<1x1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.shape_cast %72 : vector<1x4xf32> to vector<4xf32>
%93 = vector.fma %91, %89, %92 : vector<4xf32>
%94 = vector.extract_strided_slice %32 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%95 = vector.shape_cast %65 : vector<1x4xf32> to vector<4xf32>
%96 = vector.extract %94[0, 0] : vector<1x1xf32>
%97 = splat %96 : vector<4xf32>
%98 = vector.fma %97, %95, %93 : vector<4xf32>
%99 = vector.extract_strided_slice %32 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%100 = vector.shape_cast %66 : vector<1x4xf32> to vector<4xf32>
%101 = vector.extract %99[0, 0] : vector<1x1xf32>
%102 = splat %101 : vector<4xf32>
%103 = vector.fma %102, %100, %98 : vector<4xf32>
%104 = vector.extract_strided_slice %32 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%105 = vector.shape_cast %67 : vector<1x4xf32> to vector<4xf32>
%106 = vector.extract %104[0, 0] : vector<1x1xf32>
%107 = splat %106 : vector<4xf32>
%108 = vector.fma %107, %105, %103 : vector<4xf32>
%109 = vector.extract_strided_slice %33 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%110 = vector.shape_cast %68 : vector<1x4xf32> to vector<4xf32>
%111 = vector.extract %109[0, 0] : vector<1x1xf32>
%112 = splat %111 : vector<4xf32>
%113 = vector.fma %112, %110, %108 : vector<4xf32>
%114 = vector.extract_strided_slice %33 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%115 = vector.shape_cast %69 : vector<1x4xf32> to vector<4xf32>
%116 = vector.extract %114[0, 0] : vector<1x1xf32>
%117 = splat %116 : vector<4xf32>
%118 = vector.fma %117, %115, %113 : vector<4xf32>
%119 = vector.extract_strided_slice %33 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%120 = vector.shape_cast %70 : vector<1x4xf32> to vector<4xf32>
%121 = vector.extract %119[0, 0] : vector<1x1xf32>
%122 = splat %121 : vector<4xf32>
%123 = vector.fma %122, %120, %118 : vector<4xf32>
%124 = vector.extract_strided_slice %33 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%125 = vector.shape_cast %71 : vector<1x4xf32> to vector<4xf32>
%126 = vector.extract %124[0, 0] : vector<1x1xf32>
%127 = splat %126 : vector<4xf32>
%128 = vector.fma %127, %125, %123 : vector<4xf32>
%129 = vector.shape_cast %128 : vector<4xf32> to vector<1x4xf32>
%130 = vector.extract_strided_slice %34 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%131 = vector.extract %130[0, 0] : vector<1x1xf32>
%132 = splat %131 : vector<4xf32>
%133 = vector.shape_cast %73 : vector<1x4xf32> to vector<4xf32>
%134 = vector.fma %132, %89, %133 : vector<4xf32>
%135 = vector.extract_strided_slice %34 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%136 = vector.extract %135[0, 0] : vector<1x1xf32>
%137 = splat %136 : vector<4xf32>
%138 = vector.fma %137, %95, %134 : vector<4xf32>
%139 = vector.extract_strided_slice %34 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%140 = vector.extract %139[0, 0] : vector<1x1xf32>
%141 = splat %140 : vector<4xf32>
%142 = vector.fma %141, %100, %138 : vector<4xf32>
%143 = vector.extract_strided_slice %34 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%144 = vector.extract %143[0, 0] : vector<1x1xf32>
%145 = splat %144 : vector<4xf32>
%146 = vector.fma %145, %105, %142 : vector<4xf32>
%147 = vector.extract_strided_slice %35 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%148 = vector.extract %147[0, 0] : vector<1x1xf32>
%149 = splat %148 : vector<4xf32>
%150 = vector.fma %149, %110, %146 : vector<4xf32>
%151 = vector.extract_strided_slice %35 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%152 = vector.extract %151[0, 0] : vector<1x1xf32>
%153 = splat %152 : vector<4xf32>
%154 = vector.fma %153, %115, %150 : vector<4xf32>
%155 = vector.extract_strided_slice %35 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%156 = vector.extract %155[0, 0] : vector<1x1xf32>
%157 = splat %156 : vector<4xf32>
%158 = vector.fma %157, %120, %154 : vector<4xf32>
%159 = vector.extract_strided_slice %35 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%160 = vector.extract %159[0, 0] : vector<1x1xf32>
%161 = splat %160 : vector<4xf32>
%162 = vector.fma %161, %125, %158 : vector<4xf32>
%163 = vector.shape_cast %162 : vector<4xf32> to vector<1x4xf32>
%164 = vector.extract_strided_slice %36 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%165 = vector.extract %164[0, 0] : vector<1x1xf32>
%166 = splat %165 : vector<4xf32>
%167 = vector.shape_cast %74 : vector<1x4xf32> to vector<4xf32>
%168 = vector.fma %166, %89, %167 : vector<4xf32>
%169 = vector.extract_strided_slice %36 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%170 = vector.extract %169[0, 0] : vector<1x1xf32>
%171 = splat %170 : vector<4xf32>
%172 = vector.fma %171, %95, %168 : vector<4xf32>
%173 = vector.extract_strided_slice %36 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%174 = vector.extract %173[0, 0] : vector<1x1xf32>
%175 = splat %174 : vector<4xf32>
%176 = vector.fma %175, %100, %172 : vector<4xf32>
%177 = vector.extract_strided_slice %36 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%178 = vector.extract %177[0, 0] : vector<1x1xf32>
%179 = splat %178 : vector<4xf32>
%180 = vector.fma %179, %105, %176 : vector<4xf32>
%181 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%182 = vector.extract %181[0, 0] : vector<1x1xf32>
%183 = splat %182 : vector<4xf32>
%184 = vector.fma %183, %110, %180 : vector<4xf32>
%185 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%186 = vector.extract %185[0, 0] : vector<1x1xf32>
%187 = splat %186 : vector<4xf32>
%188 = vector.fma %187, %115, %184 : vector<4xf32>
%189 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%190 = vector.extract %189[0, 0] : vector<1x1xf32>
%191 = splat %190 : vector<4xf32>
%192 = vector.fma %191, %120, %188 : vector<4xf32>
%193 = vector.extract_strided_slice %37 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%194 = vector.extract %193[0, 0] : vector<1x1xf32>
%195 = splat %194 : vector<4xf32>
%196 = vector.fma %195, %125, %192 : vector<4xf32>
%197 = vector.shape_cast %196 : vector<4xf32> to vector<1x4xf32>
%198 = vector.extract_strided_slice %38 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%199 = vector.extract %198[0, 0] : vector<1x1xf32>
%200 = splat %199 : vector<4xf32>
%201 = vector.shape_cast %75 : vector<1x4xf32> to vector<4xf32>
%202 = vector.fma %200, %89, %201 : vector<4xf32>
%203 = vector.extract_strided_slice %38 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%204 = vector.extract %203[0, 0] : vector<1x1xf32>
%205 = splat %204 : vector<4xf32>
%206 = vector.fma %205, %95, %202 : vector<4xf32>
%207 = vector.extract_strided_slice %38 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%208 = vector.extract %207[0, 0] : vector<1x1xf32>
%209 = splat %208 : vector<4xf32>
%210 = vector.fma %209, %100, %206 : vector<4xf32>
%211 = vector.extract_strided_slice %38 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%212 = vector.extract %211[0, 0] : vector<1x1xf32>
%213 = splat %212 : vector<4xf32>
%214 = vector.fma %213, %105, %210 : vector<4xf32>
%215 = vector.extract_strided_slice %39 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%216 = vector.extract %215[0, 0] : vector<1x1xf32>
%217 = splat %216 : vector<4xf32>
%218 = vector.fma %217, %110, %214 : vector<4xf32>
%219 = vector.extract_strided_slice %39 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%220 = vector.extract %219[0, 0] : vector<1x1xf32>
%221 = splat %220 : vector<4xf32>
%222 = vector.fma %221, %115, %218 : vector<4xf32>
%223 = vector.extract_strided_slice %39 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%224 = vector.extract %223[0, 0] : vector<1x1xf32>
%225 = splat %224 : vector<4xf32>
%226 = vector.fma %225, %120, %222 : vector<4xf32>
%227 = vector.extract_strided_slice %39 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%228 = vector.extract %227[0, 0] : vector<1x1xf32>
%229 = splat %228 : vector<4xf32>
%230 = vector.fma %229, %125, %226 : vector<4xf32>
%231 = vector.shape_cast %230 : vector<4xf32> to vector<1x4xf32>
%232 = vector.extract_strided_slice %40 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%233 = vector.extract %232[0, 0] : vector<1x1xf32>
%234 = splat %233 : vector<4xf32>
%235 = vector.shape_cast %76 : vector<1x4xf32> to vector<4xf32>
%236 = vector.fma %234, %89, %235 : vector<4xf32>
%237 = vector.extract_strided_slice %40 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%238 = vector.extract %237[0, 0] : vector<1x1xf32>
%239 = splat %238 : vector<4xf32>
%240 = vector.fma %239, %95, %236 : vector<4xf32>
%241 = vector.extract_strided_slice %40 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%242 = vector.extract %241[0, 0] : vector<1x1xf32>
%243 = splat %242 : vector<4xf32>
%244 = vector.fma %243, %100, %240 : vector<4xf32>
%245 = vector.extract_strided_slice %40 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%246 = vector.extract %245[0, 0] : vector<1x1xf32>
%247 = splat %246 : vector<4xf32>
%248 = vector.fma %247, %105, %244 : vector<4xf32>
%249 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%250 = vector.extract %249[0, 0] : vector<1x1xf32>
%251 = splat %250 : vector<4xf32>
%252 = vector.fma %251, %110, %248 : vector<4xf32>
%253 = vector.extract_strided_slice %41 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%254 = vector.extract %253[0, 0] : vector<1x1xf32>
%255 = splat %254 : vector<4xf32>
%256 = vector.fma %255, %115, %252 : vector<4xf32>
%257 = vector.extract_strided_slice %41 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%258 = vector.extract %257[0, 0] : vector<1x1xf32>
%259 = splat %258 : vector<4xf32>
%260 = vector.fma %259, %120, %256 : vector<4xf32>
%261 = vector.extract_strided_slice %41 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%262 = vector.extract %261[0, 0] : vector<1x1xf32>
%263 = splat %262 : vector<4xf32>
%264 = vector.fma %263, %125, %260 : vector<4xf32>
%265 = vector.shape_cast %264 : vector<4xf32> to vector<1x4xf32>
%266 = vector.extract_strided_slice %42 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%267 = vector.extract %266[0, 0] : vector<1x1xf32>
%268 = splat %267 : vector<4xf32>
%269 = vector.shape_cast %77 : vector<1x4xf32> to vector<4xf32>
%270 = vector.fma %268, %89, %269 : vector<4xf32>
%271 = vector.extract_strided_slice %42 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%272 = vector.extract %271[0, 0] : vector<1x1xf32>
%273 = splat %272 : vector<4xf32>
%274 = vector.fma %273, %95, %270 : vector<4xf32>
%275 = vector.extract_strided_slice %42 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%276 = vector.extract %275[0, 0] : vector<1x1xf32>
%277 = splat %276 : vector<4xf32>
%278 = vector.fma %277, %100, %274 : vector<4xf32>
%279 = vector.extract_strided_slice %42 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%280 = vector.extract %279[0, 0] : vector<1x1xf32>
%281 = splat %280 : vector<4xf32>
%282 = vector.fma %281, %105, %278 : vector<4xf32>
%283 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%284 = vector.extract %283[0, 0] : vector<1x1xf32>
%285 = splat %284 : vector<4xf32>
%286 = vector.fma %285, %110, %282 : vector<4xf32>
%287 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%288 = vector.extract %287[0, 0] : vector<1x1xf32>
%289 = splat %288 : vector<4xf32>
%290 = vector.fma %289, %115, %286 : vector<4xf32>
%291 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%292 = vector.extract %291[0, 0] : vector<1x1xf32>
%293 = splat %292 : vector<4xf32>
%294 = vector.fma %293, %120, %290 : vector<4xf32>
%295 = vector.extract_strided_slice %43 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%296 = vector.extract %295[0, 0] : vector<1x1xf32>
%297 = splat %296 : vector<4xf32>
%298 = vector.fma %297, %125, %294 : vector<4xf32>
%299 = vector.shape_cast %298 : vector<4xf32> to vector<1x4xf32>
%300 = vector.extract_strided_slice %44 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%301 = vector.extract %300[0, 0] : vector<1x1xf32>
%302 = splat %301 : vector<4xf32>
%303 = vector.shape_cast %78 : vector<1x4xf32> to vector<4xf32>
%304 = vector.fma %302, %89, %303 : vector<4xf32>
%305 = vector.extract_strided_slice %44 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%306 = vector.extract %305[0, 0] : vector<1x1xf32>
%307 = splat %306 : vector<4xf32>
%308 = vector.fma %307, %95, %304 : vector<4xf32>
%309 = vector.extract_strided_slice %44 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%310 = vector.extract %309[0, 0] : vector<1x1xf32>
%311 = splat %310 : vector<4xf32>
%312 = vector.fma %311, %100, %308 : vector<4xf32>
%313 = vector.extract_strided_slice %44 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%314 = vector.extract %313[0, 0] : vector<1x1xf32>
%315 = splat %314 : vector<4xf32>
%316 = vector.fma %315, %105, %312 : vector<4xf32>
%317 = vector.extract_strided_slice %45 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%318 = vector.extract %317[0, 0] : vector<1x1xf32>
%319 = splat %318 : vector<4xf32>
%320 = vector.fma %319, %110, %316 : vector<4xf32>
%321 = vector.extract_strided_slice %45 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%322 = vector.extract %321[0, 0] : vector<1x1xf32>
%323 = splat %322 : vector<4xf32>
%324 = vector.fma %323, %115, %320 : vector<4xf32>
%325 = vector.extract_strided_slice %45 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%326 = vector.extract %325[0, 0] : vector<1x1xf32>
%327 = splat %326 : vector<4xf32>
%328 = vector.fma %327, %120, %324 : vector<4xf32>
%329 = vector.extract_strided_slice %45 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%330 = vector.extract %329[0, 0] : vector<1x1xf32>
%331 = splat %330 : vector<4xf32>
%332 = vector.fma %331, %125, %328 : vector<4xf32>
%333 = vector.shape_cast %332 : vector<4xf32> to vector<1x4xf32>
%334 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%335 = vector.extract %334[0, 0] : vector<1x1xf32>
%336 = splat %335 : vector<4xf32>
%337 = vector.shape_cast %79 : vector<1x4xf32> to vector<4xf32>
%338 = vector.fma %336, %89, %337 : vector<4xf32>
%339 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%340 = vector.extract %339[0, 0] : vector<1x1xf32>
%341 = splat %340 : vector<4xf32>
%342 = vector.fma %341, %95, %338 : vector<4xf32>
%343 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%344 = vector.extract %343[0, 0] : vector<1x1xf32>
%345 = splat %344 : vector<4xf32>
%346 = vector.fma %345, %100, %342 : vector<4xf32>
%347 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%348 = vector.extract %347[0, 0] : vector<1x1xf32>
%349 = splat %348 : vector<4xf32>
%350 = vector.fma %349, %105, %346 : vector<4xf32>
%351 = vector.extract_strided_slice %47 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%352 = vector.extract %351[0, 0] : vector<1x1xf32>
%353 = splat %352 : vector<4xf32>
%354 = vector.fma %353, %110, %350 : vector<4xf32>
%355 = vector.extract_strided_slice %47 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%356 = vector.extract %355[0, 0] : vector<1x1xf32>
%357 = splat %356 : vector<4xf32>
%358 = vector.fma %357, %115, %354 : vector<4xf32>
%359 = vector.extract_strided_slice %47 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%360 = vector.extract %359[0, 0] : vector<1x1xf32>
%361 = splat %360 : vector<4xf32>
%362 = vector.fma %361, %120, %358 : vector<4xf32>
%363 = vector.extract_strided_slice %47 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%364 = vector.extract %363[0, 0] : vector<1x1xf32>
%365 = splat %364 : vector<4xf32>
%366 = vector.fma %365, %125, %362 : vector<4xf32>
%367 = vector.shape_cast %366 : vector<4xf32> to vector<1x4xf32>
%368 = vector.extract_strided_slice %48 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%369 = vector.extract %368[0, 0] : vector<1x1xf32>
%370 = splat %369 : vector<4xf32>
%371 = vector.shape_cast %80 : vector<1x4xf32> to vector<4xf32>
%372 = vector.fma %370, %89, %371 : vector<4xf32>
%373 = vector.extract_strided_slice %48 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%374 = vector.extract %373[0, 0] : vector<1x1xf32>
%375 = splat %374 : vector<4xf32>
%376 = vector.fma %375, %95, %372 : vector<4xf32>
%377 = vector.extract_strided_slice %48 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%378 = vector.extract %377[0, 0] : vector<1x1xf32>
%379 = splat %378 : vector<4xf32>
%380 = vector.fma %379, %100, %376 : vector<4xf32>
%381 = vector.extract_strided_slice %48 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%382 = vector.extract %381[0, 0] : vector<1x1xf32>
%383 = splat %382 : vector<4xf32>
%384 = vector.fma %383, %105, %380 : vector<4xf32>
%385 = vector.extract_strided_slice %49 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%386 = vector.extract %385[0, 0] : vector<1x1xf32>
%387 = splat %386 : vector<4xf32>
%388 = vector.fma %387, %110, %384 : vector<4xf32>
%389 = vector.extract_strided_slice %49 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%390 = vector.extract %389[0, 0] : vector<1x1xf32>
%391 = splat %390 : vector<4xf32>
%392 = vector.fma %391, %115, %388 : vector<4xf32>
%393 = vector.extract_strided_slice %49 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%394 = vector.extract %393[0, 0] : vector<1x1xf32>
%395 = splat %394 : vector<4xf32>
%396 = vector.fma %395, %120, %392 : vector<4xf32>
%397 = vector.extract_strided_slice %49 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%398 = vector.extract %397[0, 0] : vector<1x1xf32>
%399 = splat %398 : vector<4xf32>
%400 = vector.fma %399, %125, %396 : vector<4xf32>
%401 = vector.shape_cast %400 : vector<4xf32> to vector<1x4xf32>
%402 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%403 = vector.extract %402[0, 0] : vector<1x1xf32>
%404 = splat %403 : vector<4xf32>
%405 = vector.shape_cast %81 : vector<1x4xf32> to vector<4xf32>
%406 = vector.fma %404, %89, %405 : vector<4xf32>
%407 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%408 = vector.extract %407[0, 0] : vector<1x1xf32>
%409 = splat %408 : vector<4xf32>
%410 = vector.fma %409, %95, %406 : vector<4xf32>
%411 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%412 = vector.extract %411[0, 0] : vector<1x1xf32>
%413 = splat %412 : vector<4xf32>
%414 = vector.fma %413, %100, %410 : vector<4xf32>
%415 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%416 = vector.extract %415[0, 0] : vector<1x1xf32>
%417 = splat %416 : vector<4xf32>
%418 = vector.fma %417, %105, %414 : vector<4xf32>
%419 = vector.extract_strided_slice %51 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%420 = vector.extract %419[0, 0] : vector<1x1xf32>
%421 = splat %420 : vector<4xf32>
%422 = vector.fma %421, %110, %418 : vector<4xf32>
%423 = vector.extract_strided_slice %51 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%424 = vector.extract %423[0, 0] : vector<1x1xf32>
%425 = splat %424 : vector<4xf32>
%426 = vector.fma %425, %115, %422 : vector<4xf32>
%427 = vector.extract_strided_slice %51 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%428 = vector.extract %427[0, 0] : vector<1x1xf32>
%429 = splat %428 : vector<4xf32>
%430 = vector.fma %429, %120, %426 : vector<4xf32>
%431 = vector.extract_strided_slice %51 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%432 = vector.extract %431[0, 0] : vector<1x1xf32>
%433 = splat %432 : vector<4xf32>
%434 = vector.fma %433, %125, %430 : vector<4xf32>
%435 = vector.shape_cast %434 : vector<4xf32> to vector<1x4xf32>
%436 = vector.extract_strided_slice %52 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%437 = vector.extract %436[0, 0] : vector<1x1xf32>
%438 = splat %437 : vector<4xf32>
%439 = vector.shape_cast %82 : vector<1x4xf32> to vector<4xf32>
%440 = vector.fma %438, %89, %439 : vector<4xf32>
%441 = vector.extract_strided_slice %52 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%442 = vector.extract %441[0, 0] : vector<1x1xf32>
%443 = splat %442 : vector<4xf32>
%444 = vector.fma %443, %95, %440 : vector<4xf32>
%445 = vector.extract_strided_slice %52 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%446 = vector.extract %445[0, 0] : vector<1x1xf32>
%447 = splat %446 : vector<4xf32>
%448 = vector.fma %447, %100, %444 : vector<4xf32>
%449 = vector.extract_strided_slice %52 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%450 = vector.extract %449[0, 0] : vector<1x1xf32>
%451 = splat %450 : vector<4xf32>
%452 = vector.fma %451, %105, %448 : vector<4xf32>
%453 = vector.extract_strided_slice %53 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%454 = vector.extract %453[0, 0] : vector<1x1xf32>
%455 = splat %454 : vector<4xf32>
%456 = vector.fma %455, %110, %452 : vector<4xf32>
%457 = vector.extract_strided_slice %53 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%458 = vector.extract %457[0, 0] : vector<1x1xf32>
%459 = splat %458 : vector<4xf32>
%460 = vector.fma %459, %115, %456 : vector<4xf32>
%461 = vector.extract_strided_slice %53 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%462 = vector.extract %461[0, 0] : vector<1x1xf32>
%463 = splat %462 : vector<4xf32>
%464 = vector.fma %463, %120, %460 : vector<4xf32>
%465 = vector.extract_strided_slice %53 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%466 = vector.extract %465[0, 0] : vector<1x1xf32>
%467 = splat %466 : vector<4xf32>
%468 = vector.fma %467, %125, %464 : vector<4xf32>
%469 = vector.shape_cast %468 : vector<4xf32> to vector<1x4xf32>
%470 = vector.extract_strided_slice %54 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%471 = vector.extract %470[0, 0] : vector<1x1xf32>
%472 = splat %471 : vector<4xf32>
%473 = vector.shape_cast %83 : vector<1x4xf32> to vector<4xf32>
%474 = vector.fma %472, %89, %473 : vector<4xf32>
%475 = vector.extract_strided_slice %54 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%476 = vector.extract %475[0, 0] : vector<1x1xf32>
%477 = splat %476 : vector<4xf32>
%478 = vector.fma %477, %95, %474 : vector<4xf32>
%479 = vector.extract_strided_slice %54 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%480 = vector.extract %479[0, 0] : vector<1x1xf32>
%481 = splat %480 : vector<4xf32>
%482 = vector.fma %481, %100, %478 : vector<4xf32>
%483 = vector.extract_strided_slice %54 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%484 = vector.extract %483[0, 0] : vector<1x1xf32>
%485 = splat %484 : vector<4xf32>
%486 = vector.fma %485, %105, %482 : vector<4xf32>
%487 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%488 = vector.extract %487[0, 0] : vector<1x1xf32>
%489 = splat %488 : vector<4xf32>
%490 = vector.fma %489, %110, %486 : vector<4xf32>
%491 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%492 = vector.extract %491[0, 0] : vector<1x1xf32>
%493 = splat %492 : vector<4xf32>
%494 = vector.fma %493, %115, %490 : vector<4xf32>
%495 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%496 = vector.extract %495[0, 0] : vector<1x1xf32>
%497 = splat %496 : vector<4xf32>
%498 = vector.fma %497, %120, %494 : vector<4xf32>
%499 = vector.extract_strided_slice %55 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%500 = vector.extract %499[0, 0] : vector<1x1xf32>
%501 = splat %500 : vector<4xf32>
%502 = vector.fma %501, %125, %498 : vector<4xf32>
%503 = vector.shape_cast %502 : vector<4xf32> to vector<1x4xf32>
%504 = vector.extract_strided_slice %56 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%505 = vector.extract %504[0, 0] : vector<1x1xf32>
%506 = splat %505 : vector<4xf32>
%507 = vector.shape_cast %84 : vector<1x4xf32> to vector<4xf32>
%508 = vector.fma %506, %89, %507 : vector<4xf32>
%509 = vector.extract_strided_slice %56 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%510 = vector.extract %509[0, 0] : vector<1x1xf32>
%511 = splat %510 : vector<4xf32>
%512 = vector.fma %511, %95, %508 : vector<4xf32>
%513 = vector.extract_strided_slice %56 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%514 = vector.extract %513[0, 0] : vector<1x1xf32>
%515 = splat %514 : vector<4xf32>
%516 = vector.fma %515, %100, %512 : vector<4xf32>
%517 = vector.extract_strided_slice %56 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%518 = vector.extract %517[0, 0] : vector<1x1xf32>
%519 = splat %518 : vector<4xf32>
%520 = vector.fma %519, %105, %516 : vector<4xf32>
%521 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%522 = vector.extract %521[0, 0] : vector<1x1xf32>
%523 = splat %522 : vector<4xf32>
%524 = vector.fma %523, %110, %520 : vector<4xf32>
%525 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%526 = vector.extract %525[0, 0] : vector<1x1xf32>
%527 = splat %526 : vector<4xf32>
%528 = vector.fma %527, %115, %524 : vector<4xf32>
%529 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%530 = vector.extract %529[0, 0] : vector<1x1xf32>
%531 = splat %530 : vector<4xf32>
%532 = vector.fma %531, %120, %528 : vector<4xf32>
%533 = vector.extract_strided_slice %57 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%534 = vector.extract %533[0, 0] : vector<1x1xf32>
%535 = splat %534 : vector<4xf32>
%536 = vector.fma %535, %125, %532 : vector<4xf32>
%537 = vector.shape_cast %536 : vector<4xf32> to vector<1x4xf32>
%538 = vector.extract_strided_slice %58 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%539 = vector.extract %538[0, 0] : vector<1x1xf32>
%540 = splat %539 : vector<4xf32>
%541 = vector.shape_cast %85 : vector<1x4xf32> to vector<4xf32>
%542 = vector.fma %540, %89, %541 : vector<4xf32>
%543 = vector.extract_strided_slice %58 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%544 = vector.extract %543[0, 0] : vector<1x1xf32>
%545 = splat %544 : vector<4xf32>
%546 = vector.fma %545, %95, %542 : vector<4xf32>
%547 = vector.extract_strided_slice %58 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%548 = vector.extract %547[0, 0] : vector<1x1xf32>
%549 = splat %548 : vector<4xf32>
%550 = vector.fma %549, %100, %546 : vector<4xf32>
%551 = vector.extract_strided_slice %58 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%552 = vector.extract %551[0, 0] : vector<1x1xf32>
%553 = splat %552 : vector<4xf32>
%554 = vector.fma %553, %105, %550 : vector<4xf32>
%555 = vector.extract_strided_slice %59 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%556 = vector.extract %555[0, 0] : vector<1x1xf32>
%557 = splat %556 : vector<4xf32>
%558 = vector.fma %557, %110, %554 : vector<4xf32>
%559 = vector.extract_strided_slice %59 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%560 = vector.extract %559[0, 0] : vector<1x1xf32>
%561 = splat %560 : vector<4xf32>
%562 = vector.fma %561, %115, %558 : vector<4xf32>
%563 = vector.extract_strided_slice %59 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%564 = vector.extract %563[0, 0] : vector<1x1xf32>
%565 = splat %564 : vector<4xf32>
%566 = vector.fma %565, %120, %562 : vector<4xf32>
%567 = vector.extract_strided_slice %59 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%568 = vector.extract %567[0, 0] : vector<1x1xf32>
%569 = splat %568 : vector<4xf32>
%570 = vector.fma %569, %125, %566 : vector<4xf32>
%571 = vector.shape_cast %570 : vector<4xf32> to vector<1x4xf32>
%572 = vector.extract_strided_slice %60 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%573 = vector.extract %572[0, 0] : vector<1x1xf32>
%574 = splat %573 : vector<4xf32>
%575 = vector.shape_cast %86 : vector<1x4xf32> to vector<4xf32>
%576 = vector.fma %574, %89, %575 : vector<4xf32>
%577 = vector.extract_strided_slice %60 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%578 = vector.extract %577[0, 0] : vector<1x1xf32>
%579 = splat %578 : vector<4xf32>
%580 = vector.fma %579, %95, %576 : vector<4xf32>
%581 = vector.extract_strided_slice %60 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%582 = vector.extract %581[0, 0] : vector<1x1xf32>
%583 = splat %582 : vector<4xf32>
%584 = vector.fma %583, %100, %580 : vector<4xf32>
%585 = vector.extract_strided_slice %60 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%586 = vector.extract %585[0, 0] : vector<1x1xf32>
%587 = splat %586 : vector<4xf32>
%588 = vector.fma %587, %105, %584 : vector<4xf32>
%589 = vector.extract_strided_slice %61 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%590 = vector.extract %589[0, 0] : vector<1x1xf32>
%591 = splat %590 : vector<4xf32>
%592 = vector.fma %591, %110, %588 : vector<4xf32>
%593 = vector.extract_strided_slice %61 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%594 = vector.extract %593[0, 0] : vector<1x1xf32>
%595 = splat %594 : vector<4xf32>
%596 = vector.fma %595, %115, %592 : vector<4xf32>
%597 = vector.extract_strided_slice %61 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%598 = vector.extract %597[0, 0] : vector<1x1xf32>
%599 = splat %598 : vector<4xf32>
%600 = vector.fma %599, %120, %596 : vector<4xf32>
%601 = vector.extract_strided_slice %61 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%602 = vector.extract %601[0, 0] : vector<1x1xf32>
%603 = splat %602 : vector<4xf32>
%604 = vector.fma %603, %125, %600 : vector<4xf32>
%605 = vector.shape_cast %604 : vector<4xf32> to vector<1x4xf32>
%606 = vector.extract_strided_slice %62 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%607 = vector.extract %606[0, 0] : vector<1x1xf32>
%608 = splat %607 : vector<4xf32>
%609 = vector.shape_cast %87 : vector<1x4xf32> to vector<4xf32>
%610 = vector.fma %608, %89, %609 : vector<4xf32>
%611 = vector.extract_strided_slice %62 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%612 = vector.extract %611[0, 0] : vector<1x1xf32>
%613 = splat %612 : vector<4xf32>
%614 = vector.fma %613, %95, %610 : vector<4xf32>
%615 = vector.extract_strided_slice %62 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%616 = vector.extract %615[0, 0] : vector<1x1xf32>
%617 = splat %616 : vector<4xf32>
%618 = vector.fma %617, %100, %614 : vector<4xf32>
%619 = vector.extract_strided_slice %62 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%620 = vector.extract %619[0, 0] : vector<1x1xf32>
%621 = splat %620 : vector<4xf32>
%622 = vector.fma %621, %105, %618 : vector<4xf32>
%623 = vector.extract_strided_slice %63 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%624 = vector.extract %623[0, 0] : vector<1x1xf32>
%625 = splat %624 : vector<4xf32>
%626 = vector.fma %625, %110, %622 : vector<4xf32>
%627 = vector.extract_strided_slice %63 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%628 = vector.extract %627[0, 0] : vector<1x1xf32>
%629 = splat %628 : vector<4xf32>
%630 = vector.fma %629, %115, %626 : vector<4xf32>
%631 = vector.extract_strided_slice %63 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%632 = vector.extract %631[0, 0] : vector<1x1xf32>
%633 = splat %632 : vector<4xf32>
%634 = vector.fma %633, %120, %630 : vector<4xf32>
%635 = vector.extract_strided_slice %63 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xf32> to vector<1x1xf32>
%636 = vector.extract %635[0, 0] : vector<1x1xf32>
%637 = splat %636 : vector<4xf32>
%638 = vector.fma %637, %125, %634 : vector<4xf32>
%639 = vector.shape_cast %638 : vector<4xf32> to vector<1x4xf32>
vector.transfer_write %129, %17[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %163, %17[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %197, %17[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %231, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %265, %17[%c4, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %299, %17[%c5, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %333, %17[%c6, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %367, %17[%c7, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %401, %17[%c8, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %435, %17[%c9, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %469, %17[%c10, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %503, %17[%c11, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %537, %17[%c12, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %571, %17[%c13, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %605, %17[%c14, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %639, %17[%c15, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
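// The sixteen transfer_writes above store the per-thread accumulators
// (%129, %163, ..., %639), one vector<1x4xf32> row each, back into the
// 16x4 output subview %17 of the result matrix.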
}
}
}
return
}
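// What the dumps around this point show: the matmul inner loop has been fully
// unrolled into broadcast-FMA chains. A minimal sketch of the equivalent
// computation (an assumption for illustration, not compiler output): for each
// of the 16 output rows held by a thread,
//
//   // acc, b_k : vector<4xf32>; a_ik is the scalar A[row, k] broadcast
//   %a_ik = splat %a_scalar : vector<4xf32>
//   %acc1 = vector.fma %a_ik, %b_k, %acc0 : vector<4xf32>
//
// repeated for k = 0..7; the whole per-thread tile is one contraction
//
//   %c = vector.contract {
//          indexing_maps = [affine_map<(m, n, k) -> (m, k)>,
//                           affine_map<(m, n, k) -> (k, n)>,
//                           affine_map<(m, n, k) -> (m, n)>],
//          iterator_types = ["parallel", "parallel", "reduction"]}
//        %lhs, %rhs, %acc : vector<16x8xf32>, vector<8x4xf32> into vector<16x4xf32>
//
// unrolled so that every multiply-accumulate maps to a single vector.fma.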
// -----// IR Dump After OptimizeVectorTransfer //----- //
func @_large_aligned_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c15 = constant 15 : index
%c14 = constant 14 : index
%c13 = constant 13 : index
%c12 = constant 12 : index
%c11 = constant 11 : index
%c10 = constant 10 : index
%c9 = constant 9 : index
%c7 = constant 7 : index
%c6 = constant 6 : index
%c5 = constant 5 : index
%c4 = constant 4 : index
%c3 = constant 3 : index
%c2 = constant 2 : index
%c1 = constant 1 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst_0 = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1024 = constant 1024 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
%13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%14 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%19 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%20 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%21 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%22 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%23 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%24 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%25 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%26 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%27 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%28 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%29 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%30 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%31 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%32 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%33 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%34 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%35 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%36 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%37 = memref.subview %30[%12, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%38 = memref.subview %31[0, %13] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
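// Tiling structure visible below: each workgroup walks 64x128 tiles of the
// 2048x512 result (%8 and %10 are the workgroup offsets), each thread owns a
// 16x4 subtile (%12 and %13), and the K dimension is traversed in steps of 8
// with the operand tiles staged through shared memory (%30: 64x8 slice of A,
// %31: 8x128 slice of B).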
scf.for %arg0 = %8 to %c2048 step %9 {
%39 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
scf.for %arg1 = %10 to %c512 step %11 {
%40 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%41 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%42 = memref.subview %41[%12, %13] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%43:16 = scf.for %arg2 = %c0 to %c1024 step %c8 iter_args(%arg3 = %14, %arg4 = %15, %arg5 = %16, %arg6 = %17, %arg7 = %18, %arg8 = %19, %arg9 = %20, %arg10 = %21, %arg11 = %22, %arg12 = %23, %arg13 = %24, %arg14 = %25, %arg15 = %26, %arg16 = %27, %arg17 = %28, %arg18 = %29) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
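// The sixteen iter_args (%arg3..%arg18) carry one vector<4xf32> accumulator
// per output row across the K iterations; they start from the zero vectors
// %14..%29.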
%44 = memref.subview %39[0, %arg2] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%45 = memref.subview %40[%arg2, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
gpu.barrier
%46 = vector.transfer_read %44[%32, %33], %cst_0 {in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
vector.transfer_write %46, %30[%32, %33] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%47 = vector.transfer_read %45[%34, %35], %cst_0 {in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%48 = vector.transfer_read %45[%36, %35], %cst_0 {in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
vector.transfer_write %47, %31[%34, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %48, %31[%36, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
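// The two gpu.barriers bracket the cooperative copy phase: every thread loads
// one vector<4xf32> of A (%46) and two of B (%47, %48) from global memory and
// writes them into the shared-memory tiles %30 and %31; the second barrier
// makes the tiles visible before the compute phase reads them back.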
%49 = vector.transfer_read %37[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%50 = vector.transfer_read %37[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%51 = vector.transfer_read %37[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%52 = vector.transfer_read %37[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%53 = vector.transfer_read %37[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%54 = vector.transfer_read %37[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%55 = vector.transfer_read %37[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%56 = vector.transfer_read %37[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%57 = vector.transfer_read %37[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%58 = vector.transfer_read %37[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%59 = vector.transfer_read %37[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%60 = vector.transfer_read %37[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%61 = vector.transfer_read %37[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%62 = vector.transfer_read %37[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%63 = vector.transfer_read %37[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%64 = vector.transfer_read %37[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%65 = vector.transfer_read %37[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%66 = vector.transfer_read %37[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%67 = vector.transfer_read %37[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%68 = vector.transfer_read %37[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%69 = vector.transfer_read %37[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%70 = vector.transfer_read %37[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%71 = vector.transfer_read %37[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%72 = vector.transfer_read %37[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%73 = vector.transfer_read %37[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%74 = vector.transfer_read %37[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%75 = vector.transfer_read %37[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%76 = vector.transfer_read %37[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%77 = vector.transfer_read %37[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%78 = vector.transfer_read %37[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%79 = vector.transfer_read %37[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%80 = vector.transfer_read %37[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%81 = vector.transfer_read %38[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%82 = vector.transfer_read %38[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%83 = vector.transfer_read %38[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%84 = vector.transfer_read %38[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%85 = vector.transfer_read %38[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%86 = vector.transfer_read %38[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%87 = vector.transfer_read %38[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%88 = vector.transfer_read %38[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
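// Compute-phase operands: %49..%80 are the thread's 16 rows of the 16x8 A
// tile (two vector<4xf32> halves per row), and %81..%88 are the eight 4-wide
// rows of the B tile. The code below is the 16x8 multiply-accumulate fully
// unrolled.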
%89 = vector.extract_strided_slice %49 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%90 = vector.extract %89[0] : vector<1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.fma %91, %81, %arg3 : vector<4xf32>
%93 = vector.extract_strided_slice %49 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%94 = vector.extract %93[0] : vector<1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.fma %95, %82, %92 : vector<4xf32>
%97 = vector.extract_strided_slice %49 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%98 = vector.extract %97[0] : vector<1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.fma %99, %83, %96 : vector<4xf32>
%101 = vector.extract_strided_slice %49 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%102 = vector.extract %101[0] : vector<1xf32>
%103 = splat %102 : vector<4xf32>
%104 = vector.fma %103, %84, %100 : vector<4xf32>
%105 = vector.extract_strided_slice %50 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%106 = vector.extract %105[0] : vector<1xf32>
%107 = splat %106 : vector<4xf32>
%108 = vector.fma %107, %85, %104 : vector<4xf32>
%109 = vector.extract_strided_slice %50 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%110 = vector.extract %109[0] : vector<1xf32>
%111 = splat %110 : vector<4xf32>
%112 = vector.fma %111, %86, %108 : vector<4xf32>
%113 = vector.extract_strided_slice %50 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%114 = vector.extract %113[0] : vector<1xf32>
%115 = splat %114 : vector<4xf32>
%116 = vector.fma %115, %87, %112 : vector<4xf32>
%117 = vector.extract_strided_slice %50 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %88, %116 : vector<4xf32>
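// The chain %92 -> %96 -> %100 -> %104 -> %108 -> %112 -> %116 -> %120 above
// accumulates k = 0..7 into output row 0: each step extracts one scalar of
// the A row, splats it to a vector<4xf32>, and folds in B row k with a single
// fma. Rows 1..15 below repeat the same pattern on iter_args %arg4..%arg18.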
%121 = vector.extract_strided_slice %51 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%122 = vector.extract %121[0] : vector<1xf32>
%123 = splat %122 : vector<4xf32>
%124 = vector.fma %123, %81, %arg4 : vector<4xf32>
%125 = vector.extract_strided_slice %51 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%126 = vector.extract %125[0] : vector<1xf32>
%127 = splat %126 : vector<4xf32>
%128 = vector.fma %127, %82, %124 : vector<4xf32>
%129 = vector.extract_strided_slice %51 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%130 = vector.extract %129[0] : vector<1xf32>
%131 = splat %130 : vector<4xf32>
%132 = vector.fma %131, %83, %128 : vector<4xf32>
%133 = vector.extract_strided_slice %51 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%134 = vector.extract %133[0] : vector<1xf32>
%135 = splat %134 : vector<4xf32>
%136 = vector.fma %135, %84, %132 : vector<4xf32>
%137 = vector.extract_strided_slice %52 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%138 = vector.extract %137[0] : vector<1xf32>
%139 = splat %138 : vector<4xf32>
%140 = vector.fma %139, %85, %136 : vector<4xf32>
%141 = vector.extract_strided_slice %52 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%142 = vector.extract %141[0] : vector<1xf32>
%143 = splat %142 : vector<4xf32>
%144 = vector.fma %143, %86, %140 : vector<4xf32>
%145 = vector.extract_strided_slice %52 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%146 = vector.extract %145[0] : vector<1xf32>
%147 = splat %146 : vector<4xf32>
%148 = vector.fma %147, %87, %144 : vector<4xf32>
%149 = vector.extract_strided_slice %52 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%150 = vector.extract %149[0] : vector<1xf32>
%151 = splat %150 : vector<4xf32>
%152 = vector.fma %151, %88, %148 : vector<4xf32>
%153 = vector.extract_strided_slice %53 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%154 = vector.extract %153[0] : vector<1xf32>
%155 = splat %154 : vector<4xf32>
%156 = vector.fma %155, %81, %arg5 : vector<4xf32>
%157 = vector.extract_strided_slice %53 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%158 = vector.extract %157[0] : vector<1xf32>
%159 = splat %158 : vector<4xf32>
%160 = vector.fma %159, %82, %156 : vector<4xf32>
%161 = vector.extract_strided_slice %53 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%162 = vector.extract %161[0] : vector<1xf32>
%163 = splat %162 : vector<4xf32>
%164 = vector.fma %163, %83, %160 : vector<4xf32>
%165 = vector.extract_strided_slice %53 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%166 = vector.extract %165[0] : vector<1xf32>
%167 = splat %166 : vector<4xf32>
%168 = vector.fma %167, %84, %164 : vector<4xf32>
%169 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%170 = vector.extract %169[0] : vector<1xf32>
%171 = splat %170 : vector<4xf32>
%172 = vector.fma %171, %85, %168 : vector<4xf32>
%173 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%174 = vector.extract %173[0] : vector<1xf32>
%175 = splat %174 : vector<4xf32>
%176 = vector.fma %175, %86, %172 : vector<4xf32>
%177 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%178 = vector.extract %177[0] : vector<1xf32>
%179 = splat %178 : vector<4xf32>
%180 = vector.fma %179, %87, %176 : vector<4xf32>
%181 = vector.extract_strided_slice %54 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%182 = vector.extract %181[0] : vector<1xf32>
%183 = splat %182 : vector<4xf32>
%184 = vector.fma %183, %88, %180 : vector<4xf32>
%185 = vector.extract_strided_slice %55 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%186 = vector.extract %185[0] : vector<1xf32>
%187 = splat %186 : vector<4xf32>
%188 = vector.fma %187, %81, %arg6 : vector<4xf32>
%189 = vector.extract_strided_slice %55 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%190 = vector.extract %189[0] : vector<1xf32>
%191 = splat %190 : vector<4xf32>
%192 = vector.fma %191, %82, %188 : vector<4xf32>
%193 = vector.extract_strided_slice %55 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%194 = vector.extract %193[0] : vector<1xf32>
%195 = splat %194 : vector<4xf32>
%196 = vector.fma %195, %83, %192 : vector<4xf32>
%197 = vector.extract_strided_slice %55 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%198 = vector.extract %197[0] : vector<1xf32>
%199 = splat %198 : vector<4xf32>
%200 = vector.fma %199, %84, %196 : vector<4xf32>
%201 = vector.extract_strided_slice %56 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%202 = vector.extract %201[0] : vector<1xf32>
%203 = splat %202 : vector<4xf32>
%204 = vector.fma %203, %85, %200 : vector<4xf32>
%205 = vector.extract_strided_slice %56 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%206 = vector.extract %205[0] : vector<1xf32>
%207 = splat %206 : vector<4xf32>
%208 = vector.fma %207, %86, %204 : vector<4xf32>
%209 = vector.extract_strided_slice %56 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%210 = vector.extract %209[0] : vector<1xf32>
%211 = splat %210 : vector<4xf32>
%212 = vector.fma %211, %87, %208 : vector<4xf32>
%213 = vector.extract_strided_slice %56 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%214 = vector.extract %213[0] : vector<1xf32>
%215 = splat %214 : vector<4xf32>
%216 = vector.fma %215, %88, %212 : vector<4xf32>
%217 = vector.extract_strided_slice %57 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%218 = vector.extract %217[0] : vector<1xf32>
%219 = splat %218 : vector<4xf32>
%220 = vector.fma %219, %81, %arg7 : vector<4xf32>
%221 = vector.extract_strided_slice %57 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%222 = vector.extract %221[0] : vector<1xf32>
%223 = splat %222 : vector<4xf32>
%224 = vector.fma %223, %82, %220 : vector<4xf32>
%225 = vector.extract_strided_slice %57 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%226 = vector.extract %225[0] : vector<1xf32>
%227 = splat %226 : vector<4xf32>
%228 = vector.fma %227, %83, %224 : vector<4xf32>
%229 = vector.extract_strided_slice %57 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%230 = vector.extract %229[0] : vector<1xf32>
%231 = splat %230 : vector<4xf32>
%232 = vector.fma %231, %84, %228 : vector<4xf32>
%233 = vector.extract_strided_slice %58 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%234 = vector.extract %233[0] : vector<1xf32>
%235 = splat %234 : vector<4xf32>
%236 = vector.fma %235, %85, %232 : vector<4xf32>
%237 = vector.extract_strided_slice %58 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%238 = vector.extract %237[0] : vector<1xf32>
%239 = splat %238 : vector<4xf32>
%240 = vector.fma %239, %86, %236 : vector<4xf32>
%241 = vector.extract_strided_slice %58 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%242 = vector.extract %241[0] : vector<1xf32>
%243 = splat %242 : vector<4xf32>
%244 = vector.fma %243, %87, %240 : vector<4xf32>
%245 = vector.extract_strided_slice %58 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%246 = vector.extract %245[0] : vector<1xf32>
%247 = splat %246 : vector<4xf32>
%248 = vector.fma %247, %88, %244 : vector<4xf32>
%249 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%250 = vector.extract %249[0] : vector<1xf32>
%251 = splat %250 : vector<4xf32>
%252 = vector.fma %251, %81, %arg8 : vector<4xf32>
%253 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%254 = vector.extract %253[0] : vector<1xf32>
%255 = splat %254 : vector<4xf32>
%256 = vector.fma %255, %82, %252 : vector<4xf32>
%257 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%258 = vector.extract %257[0] : vector<1xf32>
%259 = splat %258 : vector<4xf32>
%260 = vector.fma %259, %83, %256 : vector<4xf32>
%261 = vector.extract_strided_slice %59 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%262 = vector.extract %261[0] : vector<1xf32>
%263 = splat %262 : vector<4xf32>
%264 = vector.fma %263, %84, %260 : vector<4xf32>
%265 = vector.extract_strided_slice %60 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%266 = vector.extract %265[0] : vector<1xf32>
%267 = splat %266 : vector<4xf32>
%268 = vector.fma %267, %85, %264 : vector<4xf32>
%269 = vector.extract_strided_slice %60 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%270 = vector.extract %269[0] : vector<1xf32>
%271 = splat %270 : vector<4xf32>
%272 = vector.fma %271, %86, %268 : vector<4xf32>
%273 = vector.extract_strided_slice %60 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%274 = vector.extract %273[0] : vector<1xf32>
%275 = splat %274 : vector<4xf32>
%276 = vector.fma %275, %87, %272 : vector<4xf32>
%277 = vector.extract_strided_slice %60 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%278 = vector.extract %277[0] : vector<1xf32>
%279 = splat %278 : vector<4xf32>
%280 = vector.fma %279, %88, %276 : vector<4xf32>
%281 = vector.extract_strided_slice %61 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%282 = vector.extract %281[0] : vector<1xf32>
%283 = splat %282 : vector<4xf32>
%284 = vector.fma %283, %81, %arg9 : vector<4xf32>
%285 = vector.extract_strided_slice %61 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%286 = vector.extract %285[0] : vector<1xf32>
%287 = splat %286 : vector<4xf32>
%288 = vector.fma %287, %82, %284 : vector<4xf32>
%289 = vector.extract_strided_slice %61 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%290 = vector.extract %289[0] : vector<1xf32>
%291 = splat %290 : vector<4xf32>
%292 = vector.fma %291, %83, %288 : vector<4xf32>
%293 = vector.extract_strided_slice %61 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%294 = vector.extract %293[0] : vector<1xf32>
%295 = splat %294 : vector<4xf32>
%296 = vector.fma %295, %84, %292 : vector<4xf32>
%297 = vector.extract_strided_slice %62 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%298 = vector.extract %297[0] : vector<1xf32>
%299 = splat %298 : vector<4xf32>
%300 = vector.fma %299, %85, %296 : vector<4xf32>
%301 = vector.extract_strided_slice %62 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%302 = vector.extract %301[0] : vector<1xf32>
%303 = splat %302 : vector<4xf32>
%304 = vector.fma %303, %86, %300 : vector<4xf32>
%305 = vector.extract_strided_slice %62 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%306 = vector.extract %305[0] : vector<1xf32>
%307 = splat %306 : vector<4xf32>
%308 = vector.fma %307, %87, %304 : vector<4xf32>
%309 = vector.extract_strided_slice %62 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%310 = vector.extract %309[0] : vector<1xf32>
%311 = splat %310 : vector<4xf32>
%312 = vector.fma %311, %88, %308 : vector<4xf32>
%313 = vector.extract_strided_slice %63 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%314 = vector.extract %313[0] : vector<1xf32>
%315 = splat %314 : vector<4xf32>
%316 = vector.fma %315, %81, %arg10 : vector<4xf32>
%317 = vector.extract_strided_slice %63 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%318 = vector.extract %317[0] : vector<1xf32>
%319 = splat %318 : vector<4xf32>
%320 = vector.fma %319, %82, %316 : vector<4xf32>
%321 = vector.extract_strided_slice %63 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%322 = vector.extract %321[0] : vector<1xf32>
%323 = splat %322 : vector<4xf32>
%324 = vector.fma %323, %83, %320 : vector<4xf32>
%325 = vector.extract_strided_slice %63 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%326 = vector.extract %325[0] : vector<1xf32>
%327 = splat %326 : vector<4xf32>
%328 = vector.fma %327, %84, %324 : vector<4xf32>
%329 = vector.extract_strided_slice %64 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%330 = vector.extract %329[0] : vector<1xf32>
%331 = splat %330 : vector<4xf32>
%332 = vector.fma %331, %85, %328 : vector<4xf32>
%333 = vector.extract_strided_slice %64 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%334 = vector.extract %333[0] : vector<1xf32>
%335 = splat %334 : vector<4xf32>
%336 = vector.fma %335, %86, %332 : vector<4xf32>
%337 = vector.extract_strided_slice %64 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%338 = vector.extract %337[0] : vector<1xf32>
%339 = splat %338 : vector<4xf32>
%340 = vector.fma %339, %87, %336 : vector<4xf32>
%341 = vector.extract_strided_slice %64 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%342 = vector.extract %341[0] : vector<1xf32>
%343 = splat %342 : vector<4xf32>
%344 = vector.fma %343, %88, %340 : vector<4xf32>
%345 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%346 = vector.extract %345[0] : vector<1xf32>
%347 = splat %346 : vector<4xf32>
%348 = vector.fma %347, %81, %arg11 : vector<4xf32>
%349 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%350 = vector.extract %349[0] : vector<1xf32>
%351 = splat %350 : vector<4xf32>
%352 = vector.fma %351, %82, %348 : vector<4xf32>
%353 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%354 = vector.extract %353[0] : vector<1xf32>
%355 = splat %354 : vector<4xf32>
%356 = vector.fma %355, %83, %352 : vector<4xf32>
%357 = vector.extract_strided_slice %65 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%358 = vector.extract %357[0] : vector<1xf32>
%359 = splat %358 : vector<4xf32>
%360 = vector.fma %359, %84, %356 : vector<4xf32>
%361 = vector.extract_strided_slice %66 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%362 = vector.extract %361[0] : vector<1xf32>
%363 = splat %362 : vector<4xf32>
%364 = vector.fma %363, %85, %360 : vector<4xf32>
%365 = vector.extract_strided_slice %66 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%366 = vector.extract %365[0] : vector<1xf32>
%367 = splat %366 : vector<4xf32>
%368 = vector.fma %367, %86, %364 : vector<4xf32>
%369 = vector.extract_strided_slice %66 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%370 = vector.extract %369[0] : vector<1xf32>
%371 = splat %370 : vector<4xf32>
%372 = vector.fma %371, %87, %368 : vector<4xf32>
%373 = vector.extract_strided_slice %66 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%374 = vector.extract %373[0] : vector<1xf32>
%375 = splat %374 : vector<4xf32>
%376 = vector.fma %375, %88, %372 : vector<4xf32>
%377 = vector.extract_strided_slice %67 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%378 = vector.extract %377[0] : vector<1xf32>
%379 = splat %378 : vector<4xf32>
%380 = vector.fma %379, %81, %arg12 : vector<4xf32>
%381 = vector.extract_strided_slice %67 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%382 = vector.extract %381[0] : vector<1xf32>
%383 = splat %382 : vector<4xf32>
%384 = vector.fma %383, %82, %380 : vector<4xf32>
%385 = vector.extract_strided_slice %67 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%386 = vector.extract %385[0] : vector<1xf32>
%387 = splat %386 : vector<4xf32>
%388 = vector.fma %387, %83, %384 : vector<4xf32>
%389 = vector.extract_strided_slice %67 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%390 = vector.extract %389[0] : vector<1xf32>
%391 = splat %390 : vector<4xf32>
%392 = vector.fma %391, %84, %388 : vector<4xf32>
%393 = vector.extract_strided_slice %68 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%394 = vector.extract %393[0] : vector<1xf32>
%395 = splat %394 : vector<4xf32>
%396 = vector.fma %395, %85, %392 : vector<4xf32>
%397 = vector.extract_strided_slice %68 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%398 = vector.extract %397[0] : vector<1xf32>
%399 = splat %398 : vector<4xf32>
%400 = vector.fma %399, %86, %396 : vector<4xf32>
%401 = vector.extract_strided_slice %68 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%402 = vector.extract %401[0] : vector<1xf32>
%403 = splat %402 : vector<4xf32>
%404 = vector.fma %403, %87, %400 : vector<4xf32>
%405 = vector.extract_strided_slice %68 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%406 = vector.extract %405[0] : vector<1xf32>
%407 = splat %406 : vector<4xf32>
%408 = vector.fma %407, %88, %404 : vector<4xf32>
%409 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%410 = vector.extract %409[0] : vector<1xf32>
%411 = splat %410 : vector<4xf32>
%412 = vector.fma %411, %81, %arg13 : vector<4xf32>
%413 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%414 = vector.extract %413[0] : vector<1xf32>
%415 = splat %414 : vector<4xf32>
%416 = vector.fma %415, %82, %412 : vector<4xf32>
%417 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%418 = vector.extract %417[0] : vector<1xf32>
%419 = splat %418 : vector<4xf32>
%420 = vector.fma %419, %83, %416 : vector<4xf32>
%421 = vector.extract_strided_slice %69 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%422 = vector.extract %421[0] : vector<1xf32>
%423 = splat %422 : vector<4xf32>
%424 = vector.fma %423, %84, %420 : vector<4xf32>
%425 = vector.extract_strided_slice %70 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%426 = vector.extract %425[0] : vector<1xf32>
%427 = splat %426 : vector<4xf32>
%428 = vector.fma %427, %85, %424 : vector<4xf32>
%429 = vector.extract_strided_slice %70 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%430 = vector.extract %429[0] : vector<1xf32>
%431 = splat %430 : vector<4xf32>
%432 = vector.fma %431, %86, %428 : vector<4xf32>
%433 = vector.extract_strided_slice %70 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%434 = vector.extract %433[0] : vector<1xf32>
%435 = splat %434 : vector<4xf32>
%436 = vector.fma %435, %87, %432 : vector<4xf32>
%437 = vector.extract_strided_slice %70 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%438 = vector.extract %437[0] : vector<1xf32>
%439 = splat %438 : vector<4xf32>
%440 = vector.fma %439, %88, %436 : vector<4xf32>
%441 = vector.extract_strided_slice %71 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%442 = vector.extract %441[0] : vector<1xf32>
%443 = splat %442 : vector<4xf32>
%444 = vector.fma %443, %81, %arg14 : vector<4xf32>
%445 = vector.extract_strided_slice %71 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%446 = vector.extract %445[0] : vector<1xf32>
%447 = splat %446 : vector<4xf32>
%448 = vector.fma %447, %82, %444 : vector<4xf32>
%449 = vector.extract_strided_slice %71 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%450 = vector.extract %449[0] : vector<1xf32>
%451 = splat %450 : vector<4xf32>
%452 = vector.fma %451, %83, %448 : vector<4xf32>
%453 = vector.extract_strided_slice %71 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%454 = vector.extract %453[0] : vector<1xf32>
%455 = splat %454 : vector<4xf32>
%456 = vector.fma %455, %84, %452 : vector<4xf32>
%457 = vector.extract_strided_slice %72 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%458 = vector.extract %457[0] : vector<1xf32>
%459 = splat %458 : vector<4xf32>
%460 = vector.fma %459, %85, %456 : vector<4xf32>
%461 = vector.extract_strided_slice %72 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%462 = vector.extract %461[0] : vector<1xf32>
%463 = splat %462 : vector<4xf32>
%464 = vector.fma %463, %86, %460 : vector<4xf32>
%465 = vector.extract_strided_slice %72 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%466 = vector.extract %465[0] : vector<1xf32>
%467 = splat %466 : vector<4xf32>
%468 = vector.fma %467, %87, %464 : vector<4xf32>
%469 = vector.extract_strided_slice %72 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%470 = vector.extract %469[0] : vector<1xf32>
%471 = splat %470 : vector<4xf32>
%472 = vector.fma %471, %88, %468 : vector<4xf32>
%473 = vector.extract_strided_slice %73 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%474 = vector.extract %473[0] : vector<1xf32>
%475 = splat %474 : vector<4xf32>
%476 = vector.fma %475, %81, %arg15 : vector<4xf32>
%477 = vector.extract_strided_slice %73 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%478 = vector.extract %477[0] : vector<1xf32>
%479 = splat %478 : vector<4xf32>
%480 = vector.fma %479, %82, %476 : vector<4xf32>
%481 = vector.extract_strided_slice %73 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%482 = vector.extract %481[0] : vector<1xf32>
%483 = splat %482 : vector<4xf32>
%484 = vector.fma %483, %83, %480 : vector<4xf32>
%485 = vector.extract_strided_slice %73 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%486 = vector.extract %485[0] : vector<1xf32>
%487 = splat %486 : vector<4xf32>
%488 = vector.fma %487, %84, %484 : vector<4xf32>
%489 = vector.extract_strided_slice %74 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%490 = vector.extract %489[0] : vector<1xf32>
%491 = splat %490 : vector<4xf32>
%492 = vector.fma %491, %85, %488 : vector<4xf32>
%493 = vector.extract_strided_slice %74 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%494 = vector.extract %493[0] : vector<1xf32>
%495 = splat %494 : vector<4xf32>
%496 = vector.fma %495, %86, %492 : vector<4xf32>
%497 = vector.extract_strided_slice %74 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%498 = vector.extract %497[0] : vector<1xf32>
%499 = splat %498 : vector<4xf32>
%500 = vector.fma %499, %87, %496 : vector<4xf32>
%501 = vector.extract_strided_slice %74 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%502 = vector.extract %501[0] : vector<1xf32>
%503 = splat %502 : vector<4xf32>
%504 = vector.fma %503, %88, %500 : vector<4xf32>
%505 = vector.extract_strided_slice %75 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%506 = vector.extract %505[0] : vector<1xf32>
%507 = splat %506 : vector<4xf32>
%508 = vector.fma %507, %81, %arg16 : vector<4xf32>
%509 = vector.extract_strided_slice %75 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%510 = vector.extract %509[0] : vector<1xf32>
%511 = splat %510 : vector<4xf32>
%512 = vector.fma %511, %82, %508 : vector<4xf32>
%513 = vector.extract_strided_slice %75 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%514 = vector.extract %513[0] : vector<1xf32>
%515 = splat %514 : vector<4xf32>
%516 = vector.fma %515, %83, %512 : vector<4xf32>
%517 = vector.extract_strided_slice %75 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%518 = vector.extract %517[0] : vector<1xf32>
%519 = splat %518 : vector<4xf32>
%520 = vector.fma %519, %84, %516 : vector<4xf32>
%521 = vector.extract_strided_slice %76 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%522 = vector.extract %521[0] : vector<1xf32>
%523 = splat %522 : vector<4xf32>
%524 = vector.fma %523, %85, %520 : vector<4xf32>
%525 = vector.extract_strided_slice %76 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%526 = vector.extract %525[0] : vector<1xf32>
%527 = splat %526 : vector<4xf32>
%528 = vector.fma %527, %86, %524 : vector<4xf32>
%529 = vector.extract_strided_slice %76 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%530 = vector.extract %529[0] : vector<1xf32>
%531 = splat %530 : vector<4xf32>
%532 = vector.fma %531, %87, %528 : vector<4xf32>
%533 = vector.extract_strided_slice %76 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%534 = vector.extract %533[0] : vector<1xf32>
%535 = splat %534 : vector<4xf32>
%536 = vector.fma %535, %88, %532 : vector<4xf32>
%537 = vector.extract_strided_slice %77 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%538 = vector.extract %537[0] : vector<1xf32>
%539 = splat %538 : vector<4xf32>
%540 = vector.fma %539, %81, %arg17 : vector<4xf32>
%541 = vector.extract_strided_slice %77 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%542 = vector.extract %541[0] : vector<1xf32>
%543 = splat %542 : vector<4xf32>
%544 = vector.fma %543, %82, %540 : vector<4xf32>
%545 = vector.extract_strided_slice %77 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%546 = vector.extract %545[0] : vector<1xf32>
%547 = splat %546 : vector<4xf32>
%548 = vector.fma %547, %83, %544 : vector<4xf32>
%549 = vector.extract_strided_slice %77 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%550 = vector.extract %549[0] : vector<1xf32>
%551 = splat %550 : vector<4xf32>
%552 = vector.fma %551, %84, %548 : vector<4xf32>
%553 = vector.extract_strided_slice %78 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%554 = vector.extract %553[0] : vector<1xf32>
%555 = splat %554 : vector<4xf32>
%556 = vector.fma %555, %85, %552 : vector<4xf32>
%557 = vector.extract_strided_slice %78 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%558 = vector.extract %557[0] : vector<1xf32>
%559 = splat %558 : vector<4xf32>
%560 = vector.fma %559, %86, %556 : vector<4xf32>
%561 = vector.extract_strided_slice %78 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%562 = vector.extract %561[0] : vector<1xf32>
%563 = splat %562 : vector<4xf32>
%564 = vector.fma %563, %87, %560 : vector<4xf32>
%565 = vector.extract_strided_slice %78 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%566 = vector.extract %565[0] : vector<1xf32>
%567 = splat %566 : vector<4xf32>
%568 = vector.fma %567, %88, %564 : vector<4xf32>
%569 = vector.extract_strided_slice %79 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%570 = vector.extract %569[0] : vector<1xf32>
%571 = splat %570 : vector<4xf32>
%572 = vector.fma %571, %81, %arg18 : vector<4xf32>
%573 = vector.extract_strided_slice %79 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%574 = vector.extract %573[0] : vector<1xf32>
%575 = splat %574 : vector<4xf32>
%576 = vector.fma %575, %82, %572 : vector<4xf32>
%577 = vector.extract_strided_slice %79 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%578 = vector.extract %577[0] : vector<1xf32>
%579 = splat %578 : vector<4xf32>
%580 = vector.fma %579, %83, %576 : vector<4xf32>
%581 = vector.extract_strided_slice %79 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%582 = vector.extract %581[0] : vector<1xf32>
%583 = splat %582 : vector<4xf32>
%584 = vector.fma %583, %84, %580 : vector<4xf32>
%585 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%586 = vector.extract %585[0] : vector<1xf32>
%587 = splat %586 : vector<4xf32>
%588 = vector.fma %587, %85, %584 : vector<4xf32>
%589 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%590 = vector.extract %589[0] : vector<1xf32>
%591 = splat %590 : vector<4xf32>
%592 = vector.fma %591, %86, %588 : vector<4xf32>
%593 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%594 = vector.extract %593[0] : vector<1xf32>
%595 = splat %594 : vector<4xf32>
%596 = vector.fma %595, %87, %592 : vector<4xf32>
%597 = vector.extract_strided_slice %80 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%598 = vector.extract %597[0] : vector<1xf32>
%599 = splat %598 : vector<4xf32>
%600 = vector.fma %599, %88, %596 : vector<4xf32>
scf.yield %120, %152, %184, %216, %248, %280, %312, %344, %376, %408, %440, %472, %504, %536, %568, %600 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
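// The K-loop's sixteen results (%43#0-%43#15) are the rows of this thread's 16x4
// tile of the product; the unrolled stores below write them back to the output
// subview %42, row 15 first.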
vector.transfer_write %43#15, %42[%c15, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#14, %42[%c14, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#13, %42[%c13, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#12, %42[%c12, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#11, %42[%c11, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#10, %42[%c10, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#9, %42[%c9, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#8, %42[%c8, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#7, %42[%c7, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#6, %42[%c6, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#5, %42[%c5, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#4, %42[%c4, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#3, %42[%c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#2, %42[%c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#1, %42[%c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %43#0, %42[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
return
}
// -----// IR Dump After LLVMGPUPipelining //----- //
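// The pipeliner converts the K-loop below into a two-stage software pipeline:
// the first K-step's global loads (tagged __pipelining_global_load__) are hoisted
// into a prologue and threaded through the loop as three extra iter_args, and the
// upper bound is trimmed by one step of 8 (to 1016) so the final stage can drain
// after the loop. Each iteration now stores the previously fetched vectors to
// shared memory and computes on them while the next K-slice is fetched.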
func @_large_aligned_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c15 = constant 15 : index
%c14 = constant 14 : index
%c13 = constant 13 : index
%c12 = constant 12 : index
%c11 = constant 11 : index
%c10 = constant 10 : index
%c9 = constant 9 : index
%c7 = constant 7 : index
%c6 = constant 6 : index
%c5 = constant 5 : index
%c4 = constant 4 : index
%c3 = constant 3 : index
%c2 = constant 2 : index
%c1 = constant 1 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst_0 = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1016 = constant 1016 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
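// Workgroup-shared staging buffers: %3 holds the 8x128 RHS K-slice and %4 the
// 64x8 LHS K-slice (memory space 3 = GPU shared memory).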
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
%13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
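// Each thread owns a 16x4 tile of the workgroup's 64x128 output tile: its rows
// start at ty * 16 (%12) and its columns at tx * 4 (%13).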
%14 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%19 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%20 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%21 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%22 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%23 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%24 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%25 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%26 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%27 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%28 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%29 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
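// Sixteen zero accumulators (%14-%29), one vector<4xf32> per row of the thread's
// 16x4 C tile, all flattened from the 1x4 zero constant.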
%30 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%31 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%32 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%33 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%34 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%35 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%36 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
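// Cooperative-copy coordinates: for the 64x8 LHS slice each thread writes one
// vector<4xf32> at row ty*16 + tz*64 + tx floordiv 2 (%32), column (tx*4) mod 8
// (%33); for the 8x128 RHS slice it writes two vector<4xf32>s at rows %34 and
// %34 + 4 (%36), column (tx*4) mod 128 (%35).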
%37 = memref.subview %30[%12, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%38 = memref.subview %31[0, %13] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
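// Per-thread compute views: %37 is this thread's 16x8 window of the shared LHS
// tile and %38 its 8x4 window of the shared RHS tile.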
scf.for %arg0 = %8 to %c2048 step %9 {
%39 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
scf.for %arg1 = %10 to %c512 step %11 {
%40 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%41 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%42 = memref.subview %41[%12, %13] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%43 = memref.subview %39[0, %c0] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%44 = memref.subview %40[%c0, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%45 = vector.transfer_read %43[%32, %33], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%46 = vector.transfer_read %44[%34, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%47 = vector.transfer_read %44[%36, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
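// Pipeline prologue: the first K-step's global loads are issued before the loop;
// %45 is this thread's LHS fragment and %46/%47 its two RHS fragments, entering
// the loop as iter_args %arg19-%arg21.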
%48:19 = scf.for %arg2 = %c0 to %c1016 step %c8 iter_args(%arg3 = %14, %arg4 = %15, %arg5 = %16, %arg6 = %17, %arg7 = %18, %arg8 = %19, %arg9 = %20, %arg10 = %21, %arg11 = %22, %arg12 = %23, %arg13 = %24, %arg14 = %25, %arg15 = %26, %arg16 = %27, %arg17 = %28, %arg18 = %29, %arg19 = %45, %arg20 = %46, %arg21 = %47) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
gpu.barrier
vector.transfer_write %arg19, %30[%32, %33] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %arg20, %31[%34, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %arg21, %31[%36, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
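// Stage 1 (staging): store the vectors prefetched on the previous iteration
// (%arg19-%arg21) into the shared LHS/RHS tiles, fenced by barriers on both
// sides so no thread reads the tiles before every thread has written its fragment.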
%601 = vector.transfer_read %37[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%602 = vector.transfer_read %37[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%603 = vector.transfer_read %37[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%604 = vector.transfer_read %37[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%605 = vector.transfer_read %37[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%606 = vector.transfer_read %37[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%607 = vector.transfer_read %37[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%608 = vector.transfer_read %37[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%609 = vector.transfer_read %37[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%610 = vector.transfer_read %37[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%611 = vector.transfer_read %37[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%612 = vector.transfer_read %37[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%613 = vector.transfer_read %37[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%614 = vector.transfer_read %37[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%615 = vector.transfer_read %37[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%616 = vector.transfer_read %37[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%617 = vector.transfer_read %37[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%618 = vector.transfer_read %37[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%619 = vector.transfer_read %37[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%620 = vector.transfer_read %37[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%621 = vector.transfer_read %37[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%622 = vector.transfer_read %37[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%623 = vector.transfer_read %37[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%624 = vector.transfer_read %37[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%625 = vector.transfer_read %37[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%626 = vector.transfer_read %37[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%627 = vector.transfer_read %37[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%628 = vector.transfer_read %37[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%629 = vector.transfer_read %37[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%630 = vector.transfer_read %37[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%631 = vector.transfer_read %37[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%632 = vector.transfer_read %37[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%633 = vector.transfer_read %38[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%634 = vector.transfer_read %38[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%635 = vector.transfer_read %38[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%636 = vector.transfer_read %38[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%637 = vector.transfer_read %38[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%638 = vector.transfer_read %38[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%639 = vector.transfer_read %38[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%640 = vector.transfer_read %38[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
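// Stage 2 (compute): %601-%632 read the sixteen LHS rows of %37 as two 4-wide
// halves each, and %633-%640 read the eight RHS rows of %38. The fma chains that
// follow perform the 16x8x4 outer-product update of the accumulators;
// schematically (illustrative pseudocode only, not part of the dump):
//
//   for i in 0..15:                 // rows of the thread's C tile
//     for k in 0..7:                // K-slice resident in shared memory
//       acc[i][0:4] += lhs[i][k] * rhs[k][0:4]   // one vector.fma per (i, k)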
%641 = vector.extract_strided_slice %601 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%642 = vector.extract %641[0] : vector<1xf32>
%643 = splat %642 : vector<4xf32>
%644 = vector.fma %643, %633, %arg3 : vector<4xf32>
%645 = vector.extract_strided_slice %601 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%646 = vector.extract %645[0] : vector<1xf32>
%647 = splat %646 : vector<4xf32>
%648 = vector.fma %647, %634, %644 : vector<4xf32>
%649 = vector.extract_strided_slice %601 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%650 = vector.extract %649[0] : vector<1xf32>
%651 = splat %650 : vector<4xf32>
%652 = vector.fma %651, %635, %648 : vector<4xf32>
%653 = vector.extract_strided_slice %601 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%654 = vector.extract %653[0] : vector<1xf32>
%655 = splat %654 : vector<4xf32>
%656 = vector.fma %655, %636, %652 : vector<4xf32>
%657 = vector.extract_strided_slice %602 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%658 = vector.extract %657[0] : vector<1xf32>
%659 = splat %658 : vector<4xf32>
%660 = vector.fma %659, %637, %656 : vector<4xf32>
%661 = vector.extract_strided_slice %602 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%662 = vector.extract %661[0] : vector<1xf32>
%663 = splat %662 : vector<4xf32>
%664 = vector.fma %663, %638, %660 : vector<4xf32>
%665 = vector.extract_strided_slice %602 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%666 = vector.extract %665[0] : vector<1xf32>
%667 = splat %666 : vector<4xf32>
%668 = vector.fma %667, %639, %664 : vector<4xf32>
%669 = vector.extract_strided_slice %602 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%670 = vector.extract %669[0] : vector<1xf32>
%671 = splat %670 : vector<4xf32>
%672 = vector.fma %671, %640, %668 : vector<4xf32>
%673 = vector.extract_strided_slice %603 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%674 = vector.extract %673[0] : vector<1xf32>
%675 = splat %674 : vector<4xf32>
%676 = vector.fma %675, %633, %arg4 : vector<4xf32>
%677 = vector.extract_strided_slice %603 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%678 = vector.extract %677[0] : vector<1xf32>
%679 = splat %678 : vector<4xf32>
%680 = vector.fma %679, %634, %676 : vector<4xf32>
%681 = vector.extract_strided_slice %603 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%682 = vector.extract %681[0] : vector<1xf32>
%683 = splat %682 : vector<4xf32>
%684 = vector.fma %683, %635, %680 : vector<4xf32>
%685 = vector.extract_strided_slice %603 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%686 = vector.extract %685[0] : vector<1xf32>
%687 = splat %686 : vector<4xf32>
%688 = vector.fma %687, %636, %684 : vector<4xf32>
%689 = vector.extract_strided_slice %604 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%690 = vector.extract %689[0] : vector<1xf32>
%691 = splat %690 : vector<4xf32>
%692 = vector.fma %691, %637, %688 : vector<4xf32>
%693 = vector.extract_strided_slice %604 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%694 = vector.extract %693[0] : vector<1xf32>
%695 = splat %694 : vector<4xf32>
%696 = vector.fma %695, %638, %692 : vector<4xf32>
%697 = vector.extract_strided_slice %604 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%698 = vector.extract %697[0] : vector<1xf32>
%699 = splat %698 : vector<4xf32>
%700 = vector.fma %699, %639, %696 : vector<4xf32>
%701 = vector.extract_strided_slice %604 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%702 = vector.extract %701[0] : vector<1xf32>
%703 = splat %702 : vector<4xf32>
%704 = vector.fma %703, %640, %700 : vector<4xf32>
%705 = vector.extract_strided_slice %605 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%706 = vector.extract %705[0] : vector<1xf32>
%707 = splat %706 : vector<4xf32>
%708 = vector.fma %707, %633, %arg5 : vector<4xf32>
%709 = vector.extract_strided_slice %605 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%710 = vector.extract %709[0] : vector<1xf32>
%711 = splat %710 : vector<4xf32>
%712 = vector.fma %711, %634, %708 : vector<4xf32>
%713 = vector.extract_strided_slice %605 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%714 = vector.extract %713[0] : vector<1xf32>
%715 = splat %714 : vector<4xf32>
%716 = vector.fma %715, %635, %712 : vector<4xf32>
%717 = vector.extract_strided_slice %605 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%718 = vector.extract %717[0] : vector<1xf32>
%719 = splat %718 : vector<4xf32>
%720 = vector.fma %719, %636, %716 : vector<4xf32>
%721 = vector.extract_strided_slice %606 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%722 = vector.extract %721[0] : vector<1xf32>
%723 = splat %722 : vector<4xf32>
%724 = vector.fma %723, %637, %720 : vector<4xf32>
%725 = vector.extract_strided_slice %606 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%726 = vector.extract %725[0] : vector<1xf32>
%727 = splat %726 : vector<4xf32>
%728 = vector.fma %727, %638, %724 : vector<4xf32>
%729 = vector.extract_strided_slice %606 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%730 = vector.extract %729[0] : vector<1xf32>
%731 = splat %730 : vector<4xf32>
%732 = vector.fma %731, %639, %728 : vector<4xf32>
%733 = vector.extract_strided_slice %606 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%734 = vector.extract %733[0] : vector<1xf32>
%735 = splat %734 : vector<4xf32>
%736 = vector.fma %735, %640, %732 : vector<4xf32>
%737 = vector.extract_strided_slice %607 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%738 = vector.extract %737[0] : vector<1xf32>
%739 = splat %738 : vector<4xf32>
%740 = vector.fma %739, %633, %arg6 : vector<4xf32>
%741 = vector.extract_strided_slice %607 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%742 = vector.extract %741[0] : vector<1xf32>
%743 = splat %742 : vector<4xf32>
%744 = vector.fma %743, %634, %740 : vector<4xf32>
%745 = vector.extract_strided_slice %607 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%746 = vector.extract %745[0] : vector<1xf32>
%747 = splat %746 : vector<4xf32>
%748 = vector.fma %747, %635, %744 : vector<4xf32>
%749 = vector.extract_strided_slice %607 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%750 = vector.extract %749[0] : vector<1xf32>
%751 = splat %750 : vector<4xf32>
%752 = vector.fma %751, %636, %748 : vector<4xf32>
%753 = vector.extract_strided_slice %608 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%754 = vector.extract %753[0] : vector<1xf32>
%755 = splat %754 : vector<4xf32>
%756 = vector.fma %755, %637, %752 : vector<4xf32>
%757 = vector.extract_strided_slice %608 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%758 = vector.extract %757[0] : vector<1xf32>
%759 = splat %758 : vector<4xf32>
%760 = vector.fma %759, %638, %756 : vector<4xf32>
%761 = vector.extract_strided_slice %608 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%762 = vector.extract %761[0] : vector<1xf32>
%763 = splat %762 : vector<4xf32>
%764 = vector.fma %763, %639, %760 : vector<4xf32>
%765 = vector.extract_strided_slice %608 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%766 = vector.extract %765[0] : vector<1xf32>
%767 = splat %766 : vector<4xf32>
%768 = vector.fma %767, %640, %764 : vector<4xf32>
%769 = vector.extract_strided_slice %609 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%770 = vector.extract %769[0] : vector<1xf32>
%771 = splat %770 : vector<4xf32>
%772 = vector.fma %771, %633, %arg7 : vector<4xf32>
%773 = vector.extract_strided_slice %609 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%774 = vector.extract %773[0] : vector<1xf32>
%775 = splat %774 : vector<4xf32>
%776 = vector.fma %775, %634, %772 : vector<4xf32>
%777 = vector.extract_strided_slice %609 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%778 = vector.extract %777[0] : vector<1xf32>
%779 = splat %778 : vector<4xf32>
%780 = vector.fma %779, %635, %776 : vector<4xf32>
%781 = vector.extract_strided_slice %609 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%782 = vector.extract %781[0] : vector<1xf32>
%783 = splat %782 : vector<4xf32>
%784 = vector.fma %783, %636, %780 : vector<4xf32>
%785 = vector.extract_strided_slice %610 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%786 = vector.extract %785[0] : vector<1xf32>
%787 = splat %786 : vector<4xf32>
%788 = vector.fma %787, %637, %784 : vector<4xf32>
%789 = vector.extract_strided_slice %610 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%790 = vector.extract %789[0] : vector<1xf32>
%791 = splat %790 : vector<4xf32>
%792 = vector.fma %791, %638, %788 : vector<4xf32>
%793 = vector.extract_strided_slice %610 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%794 = vector.extract %793[0] : vector<1xf32>
%795 = splat %794 : vector<4xf32>
%796 = vector.fma %795, %639, %792 : vector<4xf32>
%797 = vector.extract_strided_slice %610 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%798 = vector.extract %797[0] : vector<1xf32>
%799 = splat %798 : vector<4xf32>
%800 = vector.fma %799, %640, %796 : vector<4xf32>
%801 = vector.extract_strided_slice %611 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%802 = vector.extract %801[0] : vector<1xf32>
%803 = splat %802 : vector<4xf32>
%804 = vector.fma %803, %633, %arg8 : vector<4xf32>
%805 = vector.extract_strided_slice %611 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%806 = vector.extract %805[0] : vector<1xf32>
%807 = splat %806 : vector<4xf32>
%808 = vector.fma %807, %634, %804 : vector<4xf32>
%809 = vector.extract_strided_slice %611 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%810 = vector.extract %809[0] : vector<1xf32>
%811 = splat %810 : vector<4xf32>
%812 = vector.fma %811, %635, %808 : vector<4xf32>
%813 = vector.extract_strided_slice %611 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%814 = vector.extract %813[0] : vector<1xf32>
%815 = splat %814 : vector<4xf32>
%816 = vector.fma %815, %636, %812 : vector<4xf32>
%817 = vector.extract_strided_slice %612 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%818 = vector.extract %817[0] : vector<1xf32>
%819 = splat %818 : vector<4xf32>
%820 = vector.fma %819, %637, %816 : vector<4xf32>
%821 = vector.extract_strided_slice %612 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%822 = vector.extract %821[0] : vector<1xf32>
%823 = splat %822 : vector<4xf32>
%824 = vector.fma %823, %638, %820 : vector<4xf32>
%825 = vector.extract_strided_slice %612 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%826 = vector.extract %825[0] : vector<1xf32>
%827 = splat %826 : vector<4xf32>
%828 = vector.fma %827, %639, %824 : vector<4xf32>
%829 = vector.extract_strided_slice %612 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%830 = vector.extract %829[0] : vector<1xf32>
%831 = splat %830 : vector<4xf32>
%832 = vector.fma %831, %640, %828 : vector<4xf32>
%833 = vector.extract_strided_slice %613 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%834 = vector.extract %833[0] : vector<1xf32>
%835 = splat %834 : vector<4xf32>
%836 = vector.fma %835, %633, %arg9 : vector<4xf32>
%837 = vector.extract_strided_slice %613 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%838 = vector.extract %837[0] : vector<1xf32>
%839 = splat %838 : vector<4xf32>
%840 = vector.fma %839, %634, %836 : vector<4xf32>
%841 = vector.extract_strided_slice %613 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%842 = vector.extract %841[0] : vector<1xf32>
%843 = splat %842 : vector<4xf32>
%844 = vector.fma %843, %635, %840 : vector<4xf32>
%845 = vector.extract_strided_slice %613 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%846 = vector.extract %845[0] : vector<1xf32>
%847 = splat %846 : vector<4xf32>
%848 = vector.fma %847, %636, %844 : vector<4xf32>
%849 = vector.extract_strided_slice %614 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%850 = vector.extract %849[0] : vector<1xf32>
%851 = splat %850 : vector<4xf32>
%852 = vector.fma %851, %637, %848 : vector<4xf32>
%853 = vector.extract_strided_slice %614 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%854 = vector.extract %853[0] : vector<1xf32>
%855 = splat %854 : vector<4xf32>
%856 = vector.fma %855, %638, %852 : vector<4xf32>
%857 = vector.extract_strided_slice %614 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%858 = vector.extract %857[0] : vector<1xf32>
%859 = splat %858 : vector<4xf32>
%860 = vector.fma %859, %639, %856 : vector<4xf32>
%861 = vector.extract_strided_slice %614 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%862 = vector.extract %861[0] : vector<1xf32>
%863 = splat %862 : vector<4xf32>
%864 = vector.fma %863, %640, %860 : vector<4xf32>
%865 = vector.extract_strided_slice %615 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%866 = vector.extract %865[0] : vector<1xf32>
%867 = splat %866 : vector<4xf32>
%868 = vector.fma %867, %633, %arg10 : vector<4xf32>
%869 = vector.extract_strided_slice %615 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%870 = vector.extract %869[0] : vector<1xf32>
%871 = splat %870 : vector<4xf32>
%872 = vector.fma %871, %634, %868 : vector<4xf32>
%873 = vector.extract_strided_slice %615 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%874 = vector.extract %873[0] : vector<1xf32>
%875 = splat %874 : vector<4xf32>
%876 = vector.fma %875, %635, %872 : vector<4xf32>
%877 = vector.extract_strided_slice %615 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%878 = vector.extract %877[0] : vector<1xf32>
%879 = splat %878 : vector<4xf32>
%880 = vector.fma %879, %636, %876 : vector<4xf32>
%881 = vector.extract_strided_slice %616 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%882 = vector.extract %881[0] : vector<1xf32>
%883 = splat %882 : vector<4xf32>
%884 = vector.fma %883, %637, %880 : vector<4xf32>
%885 = vector.extract_strided_slice %616 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%886 = vector.extract %885[0] : vector<1xf32>
%887 = splat %886 : vector<4xf32>
%888 = vector.fma %887, %638, %884 : vector<4xf32>
%889 = vector.extract_strided_slice %616 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%890 = vector.extract %889[0] : vector<1xf32>
%891 = splat %890 : vector<4xf32>
%892 = vector.fma %891, %639, %888 : vector<4xf32>
%893 = vector.extract_strided_slice %616 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%894 = vector.extract %893[0] : vector<1xf32>
%895 = splat %894 : vector<4xf32>
%896 = vector.fma %895, %640, %892 : vector<4xf32>
%897 = vector.extract_strided_slice %617 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%898 = vector.extract %897[0] : vector<1xf32>
%899 = splat %898 : vector<4xf32>
%900 = vector.fma %899, %633, %arg11 : vector<4xf32>
%901 = vector.extract_strided_slice %617 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%902 = vector.extract %901[0] : vector<1xf32>
%903 = splat %902 : vector<4xf32>
%904 = vector.fma %903, %634, %900 : vector<4xf32>
%905 = vector.extract_strided_slice %617 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%906 = vector.extract %905[0] : vector<1xf32>
%907 = splat %906 : vector<4xf32>
%908 = vector.fma %907, %635, %904 : vector<4xf32>
%909 = vector.extract_strided_slice %617 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%910 = vector.extract %909[0] : vector<1xf32>
%911 = splat %910 : vector<4xf32>
%912 = vector.fma %911, %636, %908 : vector<4xf32>
%913 = vector.extract_strided_slice %618 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%914 = vector.extract %913[0] : vector<1xf32>
%915 = splat %914 : vector<4xf32>
%916 = vector.fma %915, %637, %912 : vector<4xf32>
%917 = vector.extract_strided_slice %618 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%918 = vector.extract %917[0] : vector<1xf32>
%919 = splat %918 : vector<4xf32>
%920 = vector.fma %919, %638, %916 : vector<4xf32>
%921 = vector.extract_strided_slice %618 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%922 = vector.extract %921[0] : vector<1xf32>
%923 = splat %922 : vector<4xf32>
%924 = vector.fma %923, %639, %920 : vector<4xf32>
%925 = vector.extract_strided_slice %618 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%926 = vector.extract %925[0] : vector<1xf32>
%927 = splat %926 : vector<4xf32>
%928 = vector.fma %927, %640, %924 : vector<4xf32>
%929 = vector.extract_strided_slice %619 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%930 = vector.extract %929[0] : vector<1xf32>
%931 = splat %930 : vector<4xf32>
%932 = vector.fma %931, %633, %arg12 : vector<4xf32>
%933 = vector.extract_strided_slice %619 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%934 = vector.extract %933[0] : vector<1xf32>
%935 = splat %934 : vector<4xf32>
%936 = vector.fma %935, %634, %932 : vector<4xf32>
%937 = vector.extract_strided_slice %619 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%938 = vector.extract %937[0] : vector<1xf32>
%939 = splat %938 : vector<4xf32>
%940 = vector.fma %939, %635, %936 : vector<4xf32>
%941 = vector.extract_strided_slice %619 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%942 = vector.extract %941[0] : vector<1xf32>
%943 = splat %942 : vector<4xf32>
%944 = vector.fma %943, %636, %940 : vector<4xf32>
%945 = vector.extract_strided_slice %620 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%946 = vector.extract %945[0] : vector<1xf32>
%947 = splat %946 : vector<4xf32>
%948 = vector.fma %947, %637, %944 : vector<4xf32>
%949 = vector.extract_strided_slice %620 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%950 = vector.extract %949[0] : vector<1xf32>
%951 = splat %950 : vector<4xf32>
%952 = vector.fma %951, %638, %948 : vector<4xf32>
%953 = vector.extract_strided_slice %620 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%954 = vector.extract %953[0] : vector<1xf32>
%955 = splat %954 : vector<4xf32>
%956 = vector.fma %955, %639, %952 : vector<4xf32>
%957 = vector.extract_strided_slice %620 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%958 = vector.extract %957[0] : vector<1xf32>
%959 = splat %958 : vector<4xf32>
%960 = vector.fma %959, %640, %956 : vector<4xf32>
%961 = vector.extract_strided_slice %621 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%962 = vector.extract %961[0] : vector<1xf32>
%963 = splat %962 : vector<4xf32>
%964 = vector.fma %963, %633, %arg13 : vector<4xf32>
%965 = vector.extract_strided_slice %621 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%966 = vector.extract %965[0] : vector<1xf32>
%967 = splat %966 : vector<4xf32>
%968 = vector.fma %967, %634, %964 : vector<4xf32>
%969 = vector.extract_strided_slice %621 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%970 = vector.extract %969[0] : vector<1xf32>
%971 = splat %970 : vector<4xf32>
%972 = vector.fma %971, %635, %968 : vector<4xf32>
%973 = vector.extract_strided_slice %621 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%974 = vector.extract %973[0] : vector<1xf32>
%975 = splat %974 : vector<4xf32>
%976 = vector.fma %975, %636, %972 : vector<4xf32>
%977 = vector.extract_strided_slice %622 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%978 = vector.extract %977[0] : vector<1xf32>
%979 = splat %978 : vector<4xf32>
%980 = vector.fma %979, %637, %976 : vector<4xf32>
%981 = vector.extract_strided_slice %622 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%982 = vector.extract %981[0] : vector<1xf32>
%983 = splat %982 : vector<4xf32>
%984 = vector.fma %983, %638, %980 : vector<4xf32>
%985 = vector.extract_strided_slice %622 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%986 = vector.extract %985[0] : vector<1xf32>
%987 = splat %986 : vector<4xf32>
%988 = vector.fma %987, %639, %984 : vector<4xf32>
%989 = vector.extract_strided_slice %622 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%990 = vector.extract %989[0] : vector<1xf32>
%991 = splat %990 : vector<4xf32>
%992 = vector.fma %991, %640, %988 : vector<4xf32>
%993 = vector.extract_strided_slice %623 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%994 = vector.extract %993[0] : vector<1xf32>
%995 = splat %994 : vector<4xf32>
%996 = vector.fma %995, %633, %arg14 : vector<4xf32>
%997 = vector.extract_strided_slice %623 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%998 = vector.extract %997[0] : vector<1xf32>
%999 = splat %998 : vector<4xf32>
%1000 = vector.fma %999, %634, %996 : vector<4xf32>
%1001 = vector.extract_strided_slice %623 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1002 = vector.extract %1001[0] : vector<1xf32>
%1003 = splat %1002 : vector<4xf32>
%1004 = vector.fma %1003, %635, %1000 : vector<4xf32>
%1005 = vector.extract_strided_slice %623 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1006 = vector.extract %1005[0] : vector<1xf32>
%1007 = splat %1006 : vector<4xf32>
%1008 = vector.fma %1007, %636, %1004 : vector<4xf32>
%1009 = vector.extract_strided_slice %624 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1010 = vector.extract %1009[0] : vector<1xf32>
%1011 = splat %1010 : vector<4xf32>
%1012 = vector.fma %1011, %637, %1008 : vector<4xf32>
%1013 = vector.extract_strided_slice %624 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1014 = vector.extract %1013[0] : vector<1xf32>
%1015 = splat %1014 : vector<4xf32>
%1016 = vector.fma %1015, %638, %1012 : vector<4xf32>
%1017 = vector.extract_strided_slice %624 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1018 = vector.extract %1017[0] : vector<1xf32>
%1019 = splat %1018 : vector<4xf32>
%1020 = vector.fma %1019, %639, %1016 : vector<4xf32>
%1021 = vector.extract_strided_slice %624 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1022 = vector.extract %1021[0] : vector<1xf32>
%1023 = splat %1022 : vector<4xf32>
%1024 = vector.fma %1023, %640, %1020 : vector<4xf32>
%1025 = vector.extract_strided_slice %625 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1026 = vector.extract %1025[0] : vector<1xf32>
%1027 = splat %1026 : vector<4xf32>
%1028 = vector.fma %1027, %633, %arg15 : vector<4xf32>
%1029 = vector.extract_strided_slice %625 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1030 = vector.extract %1029[0] : vector<1xf32>
%1031 = splat %1030 : vector<4xf32>
%1032 = vector.fma %1031, %634, %1028 : vector<4xf32>
%1033 = vector.extract_strided_slice %625 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1034 = vector.extract %1033[0] : vector<1xf32>
%1035 = splat %1034 : vector<4xf32>
%1036 = vector.fma %1035, %635, %1032 : vector<4xf32>
%1037 = vector.extract_strided_slice %625 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1038 = vector.extract %1037[0] : vector<1xf32>
%1039 = splat %1038 : vector<4xf32>
%1040 = vector.fma %1039, %636, %1036 : vector<4xf32>
%1041 = vector.extract_strided_slice %626 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1042 = vector.extract %1041[0] : vector<1xf32>
%1043 = splat %1042 : vector<4xf32>
%1044 = vector.fma %1043, %637, %1040 : vector<4xf32>
%1045 = vector.extract_strided_slice %626 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1046 = vector.extract %1045[0] : vector<1xf32>
%1047 = splat %1046 : vector<4xf32>
%1048 = vector.fma %1047, %638, %1044 : vector<4xf32>
%1049 = vector.extract_strided_slice %626 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1050 = vector.extract %1049[0] : vector<1xf32>
%1051 = splat %1050 : vector<4xf32>
%1052 = vector.fma %1051, %639, %1048 : vector<4xf32>
%1053 = vector.extract_strided_slice %626 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1054 = vector.extract %1053[0] : vector<1xf32>
%1055 = splat %1054 : vector<4xf32>
%1056 = vector.fma %1055, %640, %1052 : vector<4xf32>
%1057 = vector.extract_strided_slice %627 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1058 = vector.extract %1057[0] : vector<1xf32>
%1059 = splat %1058 : vector<4xf32>
%1060 = vector.fma %1059, %633, %arg16 : vector<4xf32>
%1061 = vector.extract_strided_slice %627 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1062 = vector.extract %1061[0] : vector<1xf32>
%1063 = splat %1062 : vector<4xf32>
%1064 = vector.fma %1063, %634, %1060 : vector<4xf32>
%1065 = vector.extract_strided_slice %627 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1066 = vector.extract %1065[0] : vector<1xf32>
%1067 = splat %1066 : vector<4xf32>
%1068 = vector.fma %1067, %635, %1064 : vector<4xf32>
%1069 = vector.extract_strided_slice %627 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1070 = vector.extract %1069[0] : vector<1xf32>
%1071 = splat %1070 : vector<4xf32>
%1072 = vector.fma %1071, %636, %1068 : vector<4xf32>
%1073 = vector.extract_strided_slice %628 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1074 = vector.extract %1073[0] : vector<1xf32>
%1075 = splat %1074 : vector<4xf32>
%1076 = vector.fma %1075, %637, %1072 : vector<4xf32>
%1077 = vector.extract_strided_slice %628 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1078 = vector.extract %1077[0] : vector<1xf32>
%1079 = splat %1078 : vector<4xf32>
%1080 = vector.fma %1079, %638, %1076 : vector<4xf32>
%1081 = vector.extract_strided_slice %628 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1082 = vector.extract %1081[0] : vector<1xf32>
%1083 = splat %1082 : vector<4xf32>
%1084 = vector.fma %1083, %639, %1080 : vector<4xf32>
%1085 = vector.extract_strided_slice %628 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1086 = vector.extract %1085[0] : vector<1xf32>
%1087 = splat %1086 : vector<4xf32>
%1088 = vector.fma %1087, %640, %1084 : vector<4xf32>
%1089 = vector.extract_strided_slice %629 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1090 = vector.extract %1089[0] : vector<1xf32>
%1091 = splat %1090 : vector<4xf32>
%1092 = vector.fma %1091, %633, %arg17 : vector<4xf32>
%1093 = vector.extract_strided_slice %629 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1094 = vector.extract %1093[0] : vector<1xf32>
%1095 = splat %1094 : vector<4xf32>
%1096 = vector.fma %1095, %634, %1092 : vector<4xf32>
%1097 = vector.extract_strided_slice %629 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1098 = vector.extract %1097[0] : vector<1xf32>
%1099 = splat %1098 : vector<4xf32>
%1100 = vector.fma %1099, %635, %1096 : vector<4xf32>
%1101 = vector.extract_strided_slice %629 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1102 = vector.extract %1101[0] : vector<1xf32>
%1103 = splat %1102 : vector<4xf32>
%1104 = vector.fma %1103, %636, %1100 : vector<4xf32>
%1105 = vector.extract_strided_slice %630 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1106 = vector.extract %1105[0] : vector<1xf32>
%1107 = splat %1106 : vector<4xf32>
%1108 = vector.fma %1107, %637, %1104 : vector<4xf32>
%1109 = vector.extract_strided_slice %630 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1110 = vector.extract %1109[0] : vector<1xf32>
%1111 = splat %1110 : vector<4xf32>
%1112 = vector.fma %1111, %638, %1108 : vector<4xf32>
%1113 = vector.extract_strided_slice %630 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1114 = vector.extract %1113[0] : vector<1xf32>
%1115 = splat %1114 : vector<4xf32>
%1116 = vector.fma %1115, %639, %1112 : vector<4xf32>
%1117 = vector.extract_strided_slice %630 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1118 = vector.extract %1117[0] : vector<1xf32>
%1119 = splat %1118 : vector<4xf32>
%1120 = vector.fma %1119, %640, %1116 : vector<4xf32>
%1121 = vector.extract_strided_slice %631 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1122 = vector.extract %1121[0] : vector<1xf32>
%1123 = splat %1122 : vector<4xf32>
%1124 = vector.fma %1123, %633, %arg18 : vector<4xf32>
%1125 = vector.extract_strided_slice %631 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1126 = vector.extract %1125[0] : vector<1xf32>
%1127 = splat %1126 : vector<4xf32>
%1128 = vector.fma %1127, %634, %1124 : vector<4xf32>
%1129 = vector.extract_strided_slice %631 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1130 = vector.extract %1129[0] : vector<1xf32>
%1131 = splat %1130 : vector<4xf32>
%1132 = vector.fma %1131, %635, %1128 : vector<4xf32>
%1133 = vector.extract_strided_slice %631 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1134 = vector.extract %1133[0] : vector<1xf32>
%1135 = splat %1134 : vector<4xf32>
%1136 = vector.fma %1135, %636, %1132 : vector<4xf32>
%1137 = vector.extract_strided_slice %632 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1138 = vector.extract %1137[0] : vector<1xf32>
%1139 = splat %1138 : vector<4xf32>
%1140 = vector.fma %1139, %637, %1136 : vector<4xf32>
%1141 = vector.extract_strided_slice %632 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1142 = vector.extract %1141[0] : vector<1xf32>
%1143 = splat %1142 : vector<4xf32>
%1144 = vector.fma %1143, %638, %1140 : vector<4xf32>
%1145 = vector.extract_strided_slice %632 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1146 = vector.extract %1145[0] : vector<1xf32>
%1147 = splat %1146 : vector<4xf32>
%1148 = vector.fma %1147, %639, %1144 : vector<4xf32>
%1149 = vector.extract_strided_slice %632 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1150 = vector.extract %1149[0] : vector<1xf32>
%1151 = splat %1150 : vector<4xf32>
%1152 = vector.fma %1151, %640, %1148 : vector<4xf32>
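        // End of the unrolled multiply-accumulate chain for this 8-wide K-step.
        // The next few ops prefetch the K+8 tile straight from global memory
        // (the reads tagged __pipelining_global_load__): the loop is software-
        // pipelined, so iteration i's FMAs overlap the global loads for
        // iteration i+1, and the loaded vectors travel through scf.yield as
        // the last three iter_args.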
%1153 = addi %arg2, %c8 : index
%1154 = memref.subview %39[0, %1153] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%1155 = addi %arg2, %c8 : index
%1156 = memref.subview %40[%1155, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%1157 = vector.transfer_read %1154[%32, %33], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%1158 = vector.transfer_read %1156[%34, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%1159 = vector.transfer_read %1156[%36, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
scf.yield %672, %704, %736, %768, %800, %832, %864, %896, %928, %960, %992, %1024, %1056, %1088, %1120, %1152, %1157, %1158, %1159 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
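      // Pipeline epilogue (the peeled last K-step): commit the final
      // prefetched vectors (%48#16..%48#18) to shared memory, synchronize,
      // then run one more round of shared-memory reads and FMAs against the
      // accumulators %48#0..%48#15 carried out of the loop.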
gpu.barrier
vector.transfer_write %48#16, %30[%32, %33] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %48#17, %31[%34, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %48#18, %31[%36, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%49 = vector.transfer_read %37[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%50 = vector.transfer_read %37[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%51 = vector.transfer_read %37[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%52 = vector.transfer_read %37[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%53 = vector.transfer_read %37[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%54 = vector.transfer_read %37[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%55 = vector.transfer_read %37[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%56 = vector.transfer_read %37[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%57 = vector.transfer_read %37[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%58 = vector.transfer_read %37[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%59 = vector.transfer_read %37[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%60 = vector.transfer_read %37[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%61 = vector.transfer_read %37[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%62 = vector.transfer_read %37[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%63 = vector.transfer_read %37[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%64 = vector.transfer_read %37[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%65 = vector.transfer_read %37[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%66 = vector.transfer_read %37[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%67 = vector.transfer_read %37[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%68 = vector.transfer_read %37[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%69 = vector.transfer_read %37[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%70 = vector.transfer_read %37[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%71 = vector.transfer_read %37[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%72 = vector.transfer_read %37[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%73 = vector.transfer_read %37[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%74 = vector.transfer_read %37[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%75 = vector.transfer_read %37[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%76 = vector.transfer_read %37[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%77 = vector.transfer_read %37[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%78 = vector.transfer_read %37[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%79 = vector.transfer_read %37[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%80 = vector.transfer_read %37[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%81 = vector.transfer_read %38[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%82 = vector.transfer_read %38[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%83 = vector.transfer_read %38[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%84 = vector.transfer_read %38[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%85 = vector.transfer_read %38[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%86 = vector.transfer_read %38[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%87 = vector.transfer_read %38[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%88 = vector.transfer_read %38[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
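      // Each accumulator update below is a rank-1 outer-product step,
      //   acc[r] += lhs[r, k] * rhs[k, 0:4]    for k = 0..7, r = 0..15,
      // with the scalar lhs[r, k] materialized by
      // extract_strided_slice -> extract -> splat feeding a vector.fma.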
%89 = vector.extract_strided_slice %49 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%90 = vector.extract %89[0] : vector<1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.fma %91, %81, %48#0 : vector<4xf32>
%93 = vector.extract_strided_slice %49 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%94 = vector.extract %93[0] : vector<1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.fma %95, %82, %92 : vector<4xf32>
%97 = vector.extract_strided_slice %49 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%98 = vector.extract %97[0] : vector<1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.fma %99, %83, %96 : vector<4xf32>
%101 = vector.extract_strided_slice %49 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%102 = vector.extract %101[0] : vector<1xf32>
%103 = splat %102 : vector<4xf32>
%104 = vector.fma %103, %84, %100 : vector<4xf32>
%105 = vector.extract_strided_slice %50 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%106 = vector.extract %105[0] : vector<1xf32>
%107 = splat %106 : vector<4xf32>
%108 = vector.fma %107, %85, %104 : vector<4xf32>
%109 = vector.extract_strided_slice %50 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%110 = vector.extract %109[0] : vector<1xf32>
%111 = splat %110 : vector<4xf32>
%112 = vector.fma %111, %86, %108 : vector<4xf32>
%113 = vector.extract_strided_slice %50 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%114 = vector.extract %113[0] : vector<1xf32>
%115 = splat %114 : vector<4xf32>
%116 = vector.fma %115, %87, %112 : vector<4xf32>
%117 = vector.extract_strided_slice %50 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %88, %116 : vector<4xf32>
%121 = vector.extract_strided_slice %51 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%122 = vector.extract %121[0] : vector<1xf32>
%123 = splat %122 : vector<4xf32>
%124 = vector.fma %123, %81, %48#1 : vector<4xf32>
%125 = vector.extract_strided_slice %51 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%126 = vector.extract %125[0] : vector<1xf32>
%127 = splat %126 : vector<4xf32>
%128 = vector.fma %127, %82, %124 : vector<4xf32>
%129 = vector.extract_strided_slice %51 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%130 = vector.extract %129[0] : vector<1xf32>
%131 = splat %130 : vector<4xf32>
%132 = vector.fma %131, %83, %128 : vector<4xf32>
%133 = vector.extract_strided_slice %51 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%134 = vector.extract %133[0] : vector<1xf32>
%135 = splat %134 : vector<4xf32>
%136 = vector.fma %135, %84, %132 : vector<4xf32>
%137 = vector.extract_strided_slice %52 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%138 = vector.extract %137[0] : vector<1xf32>
%139 = splat %138 : vector<4xf32>
%140 = vector.fma %139, %85, %136 : vector<4xf32>
%141 = vector.extract_strided_slice %52 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%142 = vector.extract %141[0] : vector<1xf32>
%143 = splat %142 : vector<4xf32>
%144 = vector.fma %143, %86, %140 : vector<4xf32>
%145 = vector.extract_strided_slice %52 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%146 = vector.extract %145[0] : vector<1xf32>
%147 = splat %146 : vector<4xf32>
%148 = vector.fma %147, %87, %144 : vector<4xf32>
%149 = vector.extract_strided_slice %52 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%150 = vector.extract %149[0] : vector<1xf32>
%151 = splat %150 : vector<4xf32>
%152 = vector.fma %151, %88, %148 : vector<4xf32>
%153 = vector.extract_strided_slice %53 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%154 = vector.extract %153[0] : vector<1xf32>
%155 = splat %154 : vector<4xf32>
%156 = vector.fma %155, %81, %48#2 : vector<4xf32>
%157 = vector.extract_strided_slice %53 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%158 = vector.extract %157[0] : vector<1xf32>
%159 = splat %158 : vector<4xf32>
%160 = vector.fma %159, %82, %156 : vector<4xf32>
%161 = vector.extract_strided_slice %53 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%162 = vector.extract %161[0] : vector<1xf32>
%163 = splat %162 : vector<4xf32>
%164 = vector.fma %163, %83, %160 : vector<4xf32>
%165 = vector.extract_strided_slice %53 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%166 = vector.extract %165[0] : vector<1xf32>
%167 = splat %166 : vector<4xf32>
%168 = vector.fma %167, %84, %164 : vector<4xf32>
%169 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%170 = vector.extract %169[0] : vector<1xf32>
%171 = splat %170 : vector<4xf32>
%172 = vector.fma %171, %85, %168 : vector<4xf32>
%173 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%174 = vector.extract %173[0] : vector<1xf32>
%175 = splat %174 : vector<4xf32>
%176 = vector.fma %175, %86, %172 : vector<4xf32>
%177 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%178 = vector.extract %177[0] : vector<1xf32>
%179 = splat %178 : vector<4xf32>
%180 = vector.fma %179, %87, %176 : vector<4xf32>
%181 = vector.extract_strided_slice %54 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%182 = vector.extract %181[0] : vector<1xf32>
%183 = splat %182 : vector<4xf32>
%184 = vector.fma %183, %88, %180 : vector<4xf32>
%185 = vector.extract_strided_slice %55 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%186 = vector.extract %185[0] : vector<1xf32>
%187 = splat %186 : vector<4xf32>
%188 = vector.fma %187, %81, %48#3 : vector<4xf32>
%189 = vector.extract_strided_slice %55 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%190 = vector.extract %189[0] : vector<1xf32>
%191 = splat %190 : vector<4xf32>
%192 = vector.fma %191, %82, %188 : vector<4xf32>
%193 = vector.extract_strided_slice %55 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%194 = vector.extract %193[0] : vector<1xf32>
%195 = splat %194 : vector<4xf32>
%196 = vector.fma %195, %83, %192 : vector<4xf32>
%197 = vector.extract_strided_slice %55 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%198 = vector.extract %197[0] : vector<1xf32>
%199 = splat %198 : vector<4xf32>
%200 = vector.fma %199, %84, %196 : vector<4xf32>
%201 = vector.extract_strided_slice %56 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%202 = vector.extract %201[0] : vector<1xf32>
%203 = splat %202 : vector<4xf32>
%204 = vector.fma %203, %85, %200 : vector<4xf32>
%205 = vector.extract_strided_slice %56 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%206 = vector.extract %205[0] : vector<1xf32>
%207 = splat %206 : vector<4xf32>
%208 = vector.fma %207, %86, %204 : vector<4xf32>
%209 = vector.extract_strided_slice %56 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%210 = vector.extract %209[0] : vector<1xf32>
%211 = splat %210 : vector<4xf32>
%212 = vector.fma %211, %87, %208 : vector<4xf32>
%213 = vector.extract_strided_slice %56 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%214 = vector.extract %213[0] : vector<1xf32>
%215 = splat %214 : vector<4xf32>
%216 = vector.fma %215, %88, %212 : vector<4xf32>
%217 = vector.extract_strided_slice %57 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%218 = vector.extract %217[0] : vector<1xf32>
%219 = splat %218 : vector<4xf32>
%220 = vector.fma %219, %81, %48#4 : vector<4xf32>
%221 = vector.extract_strided_slice %57 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%222 = vector.extract %221[0] : vector<1xf32>
%223 = splat %222 : vector<4xf32>
%224 = vector.fma %223, %82, %220 : vector<4xf32>
%225 = vector.extract_strided_slice %57 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%226 = vector.extract %225[0] : vector<1xf32>
%227 = splat %226 : vector<4xf32>
%228 = vector.fma %227, %83, %224 : vector<4xf32>
%229 = vector.extract_strided_slice %57 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%230 = vector.extract %229[0] : vector<1xf32>
%231 = splat %230 : vector<4xf32>
%232 = vector.fma %231, %84, %228 : vector<4xf32>
%233 = vector.extract_strided_slice %58 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%234 = vector.extract %233[0] : vector<1xf32>
%235 = splat %234 : vector<4xf32>
%236 = vector.fma %235, %85, %232 : vector<4xf32>
%237 = vector.extract_strided_slice %58 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%238 = vector.extract %237[0] : vector<1xf32>
%239 = splat %238 : vector<4xf32>
%240 = vector.fma %239, %86, %236 : vector<4xf32>
%241 = vector.extract_strided_slice %58 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%242 = vector.extract %241[0] : vector<1xf32>
%243 = splat %242 : vector<4xf32>
%244 = vector.fma %243, %87, %240 : vector<4xf32>
%245 = vector.extract_strided_slice %58 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%246 = vector.extract %245[0] : vector<1xf32>
%247 = splat %246 : vector<4xf32>
%248 = vector.fma %247, %88, %244 : vector<4xf32>
%249 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%250 = vector.extract %249[0] : vector<1xf32>
%251 = splat %250 : vector<4xf32>
%252 = vector.fma %251, %81, %48#5 : vector<4xf32>
%253 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%254 = vector.extract %253[0] : vector<1xf32>
%255 = splat %254 : vector<4xf32>
%256 = vector.fma %255, %82, %252 : vector<4xf32>
%257 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%258 = vector.extract %257[0] : vector<1xf32>
%259 = splat %258 : vector<4xf32>
%260 = vector.fma %259, %83, %256 : vector<4xf32>
%261 = vector.extract_strided_slice %59 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%262 = vector.extract %261[0] : vector<1xf32>
%263 = splat %262 : vector<4xf32>
%264 = vector.fma %263, %84, %260 : vector<4xf32>
%265 = vector.extract_strided_slice %60 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%266 = vector.extract %265[0] : vector<1xf32>
%267 = splat %266 : vector<4xf32>
%268 = vector.fma %267, %85, %264 : vector<4xf32>
%269 = vector.extract_strided_slice %60 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%270 = vector.extract %269[0] : vector<1xf32>
%271 = splat %270 : vector<4xf32>
%272 = vector.fma %271, %86, %268 : vector<4xf32>
%273 = vector.extract_strided_slice %60 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%274 = vector.extract %273[0] : vector<1xf32>
%275 = splat %274 : vector<4xf32>
%276 = vector.fma %275, %87, %272 : vector<4xf32>
%277 = vector.extract_strided_slice %60 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%278 = vector.extract %277[0] : vector<1xf32>
%279 = splat %278 : vector<4xf32>
%280 = vector.fma %279, %88, %276 : vector<4xf32>
%281 = vector.extract_strided_slice %61 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%282 = vector.extract %281[0] : vector<1xf32>
%283 = splat %282 : vector<4xf32>
%284 = vector.fma %283, %81, %48#6 : vector<4xf32>
%285 = vector.extract_strided_slice %61 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%286 = vector.extract %285[0] : vector<1xf32>
%287 = splat %286 : vector<4xf32>
%288 = vector.fma %287, %82, %284 : vector<4xf32>
%289 = vector.extract_strided_slice %61 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%290 = vector.extract %289[0] : vector<1xf32>
%291 = splat %290 : vector<4xf32>
%292 = vector.fma %291, %83, %288 : vector<4xf32>
%293 = vector.extract_strided_slice %61 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%294 = vector.extract %293[0] : vector<1xf32>
%295 = splat %294 : vector<4xf32>
%296 = vector.fma %295, %84, %292 : vector<4xf32>
%297 = vector.extract_strided_slice %62 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%298 = vector.extract %297[0] : vector<1xf32>
%299 = splat %298 : vector<4xf32>
%300 = vector.fma %299, %85, %296 : vector<4xf32>
%301 = vector.extract_strided_slice %62 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%302 = vector.extract %301[0] : vector<1xf32>
%303 = splat %302 : vector<4xf32>
%304 = vector.fma %303, %86, %300 : vector<4xf32>
%305 = vector.extract_strided_slice %62 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%306 = vector.extract %305[0] : vector<1xf32>
%307 = splat %306 : vector<4xf32>
%308 = vector.fma %307, %87, %304 : vector<4xf32>
%309 = vector.extract_strided_slice %62 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%310 = vector.extract %309[0] : vector<1xf32>
%311 = splat %310 : vector<4xf32>
%312 = vector.fma %311, %88, %308 : vector<4xf32>
%313 = vector.extract_strided_slice %63 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%314 = vector.extract %313[0] : vector<1xf32>
%315 = splat %314 : vector<4xf32>
%316 = vector.fma %315, %81, %48#7 : vector<4xf32>
%317 = vector.extract_strided_slice %63 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%318 = vector.extract %317[0] : vector<1xf32>
%319 = splat %318 : vector<4xf32>
%320 = vector.fma %319, %82, %316 : vector<4xf32>
%321 = vector.extract_strided_slice %63 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%322 = vector.extract %321[0] : vector<1xf32>
%323 = splat %322 : vector<4xf32>
%324 = vector.fma %323, %83, %320 : vector<4xf32>
%325 = vector.extract_strided_slice %63 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%326 = vector.extract %325[0] : vector<1xf32>
%327 = splat %326 : vector<4xf32>
%328 = vector.fma %327, %84, %324 : vector<4xf32>
%329 = vector.extract_strided_slice %64 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%330 = vector.extract %329[0] : vector<1xf32>
%331 = splat %330 : vector<4xf32>
%332 = vector.fma %331, %85, %328 : vector<4xf32>
%333 = vector.extract_strided_slice %64 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%334 = vector.extract %333[0] : vector<1xf32>
%335 = splat %334 : vector<4xf32>
%336 = vector.fma %335, %86, %332 : vector<4xf32>
%337 = vector.extract_strided_slice %64 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%338 = vector.extract %337[0] : vector<1xf32>
%339 = splat %338 : vector<4xf32>
%340 = vector.fma %339, %87, %336 : vector<4xf32>
%341 = vector.extract_strided_slice %64 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%342 = vector.extract %341[0] : vector<1xf32>
%343 = splat %342 : vector<4xf32>
%344 = vector.fma %343, %88, %340 : vector<4xf32>
%345 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%346 = vector.extract %345[0] : vector<1xf32>
%347 = splat %346 : vector<4xf32>
%348 = vector.fma %347, %81, %48#8 : vector<4xf32>
%349 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%350 = vector.extract %349[0] : vector<1xf32>
%351 = splat %350 : vector<4xf32>
%352 = vector.fma %351, %82, %348 : vector<4xf32>
%353 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%354 = vector.extract %353[0] : vector<1xf32>
%355 = splat %354 : vector<4xf32>
%356 = vector.fma %355, %83, %352 : vector<4xf32>
%357 = vector.extract_strided_slice %65 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%358 = vector.extract %357[0] : vector<1xf32>
%359 = splat %358 : vector<4xf32>
%360 = vector.fma %359, %84, %356 : vector<4xf32>
%361 = vector.extract_strided_slice %66 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%362 = vector.extract %361[0] : vector<1xf32>
%363 = splat %362 : vector<4xf32>
%364 = vector.fma %363, %85, %360 : vector<4xf32>
%365 = vector.extract_strided_slice %66 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%366 = vector.extract %365[0] : vector<1xf32>
%367 = splat %366 : vector<4xf32>
%368 = vector.fma %367, %86, %364 : vector<4xf32>
%369 = vector.extract_strided_slice %66 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%370 = vector.extract %369[0] : vector<1xf32>
%371 = splat %370 : vector<4xf32>
%372 = vector.fma %371, %87, %368 : vector<4xf32>
%373 = vector.extract_strided_slice %66 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%374 = vector.extract %373[0] : vector<1xf32>
%375 = splat %374 : vector<4xf32>
%376 = vector.fma %375, %88, %372 : vector<4xf32>
%377 = vector.extract_strided_slice %67 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%378 = vector.extract %377[0] : vector<1xf32>
%379 = splat %378 : vector<4xf32>
%380 = vector.fma %379, %81, %48#9 : vector<4xf32>
%381 = vector.extract_strided_slice %67 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%382 = vector.extract %381[0] : vector<1xf32>
%383 = splat %382 : vector<4xf32>
%384 = vector.fma %383, %82, %380 : vector<4xf32>
%385 = vector.extract_strided_slice %67 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%386 = vector.extract %385[0] : vector<1xf32>
%387 = splat %386 : vector<4xf32>
%388 = vector.fma %387, %83, %384 : vector<4xf32>
%389 = vector.extract_strided_slice %67 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%390 = vector.extract %389[0] : vector<1xf32>
%391 = splat %390 : vector<4xf32>
%392 = vector.fma %391, %84, %388 : vector<4xf32>
%393 = vector.extract_strided_slice %68 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%394 = vector.extract %393[0] : vector<1xf32>
%395 = splat %394 : vector<4xf32>
%396 = vector.fma %395, %85, %392 : vector<4xf32>
%397 = vector.extract_strided_slice %68 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%398 = vector.extract %397[0] : vector<1xf32>
%399 = splat %398 : vector<4xf32>
%400 = vector.fma %399, %86, %396 : vector<4xf32>
%401 = vector.extract_strided_slice %68 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%402 = vector.extract %401[0] : vector<1xf32>
%403 = splat %402 : vector<4xf32>
%404 = vector.fma %403, %87, %400 : vector<4xf32>
%405 = vector.extract_strided_slice %68 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%406 = vector.extract %405[0] : vector<1xf32>
%407 = splat %406 : vector<4xf32>
%408 = vector.fma %407, %88, %404 : vector<4xf32>
%409 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%410 = vector.extract %409[0] : vector<1xf32>
%411 = splat %410 : vector<4xf32>
%412 = vector.fma %411, %81, %48#10 : vector<4xf32>
%413 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%414 = vector.extract %413[0] : vector<1xf32>
%415 = splat %414 : vector<4xf32>
%416 = vector.fma %415, %82, %412 : vector<4xf32>
%417 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%418 = vector.extract %417[0] : vector<1xf32>
%419 = splat %418 : vector<4xf32>
%420 = vector.fma %419, %83, %416 : vector<4xf32>
%421 = vector.extract_strided_slice %69 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%422 = vector.extract %421[0] : vector<1xf32>
%423 = splat %422 : vector<4xf32>
%424 = vector.fma %423, %84, %420 : vector<4xf32>
%425 = vector.extract_strided_slice %70 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%426 = vector.extract %425[0] : vector<1xf32>
%427 = splat %426 : vector<4xf32>
%428 = vector.fma %427, %85, %424 : vector<4xf32>
%429 = vector.extract_strided_slice %70 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%430 = vector.extract %429[0] : vector<1xf32>
%431 = splat %430 : vector<4xf32>
%432 = vector.fma %431, %86, %428 : vector<4xf32>
%433 = vector.extract_strided_slice %70 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%434 = vector.extract %433[0] : vector<1xf32>
%435 = splat %434 : vector<4xf32>
%436 = vector.fma %435, %87, %432 : vector<4xf32>
%437 = vector.extract_strided_slice %70 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%438 = vector.extract %437[0] : vector<1xf32>
%439 = splat %438 : vector<4xf32>
%440 = vector.fma %439, %88, %436 : vector<4xf32>
%441 = vector.extract_strided_slice %71 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%442 = vector.extract %441[0] : vector<1xf32>
%443 = splat %442 : vector<4xf32>
%444 = vector.fma %443, %81, %48#11 : vector<4xf32>
%445 = vector.extract_strided_slice %71 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%446 = vector.extract %445[0] : vector<1xf32>
%447 = splat %446 : vector<4xf32>
%448 = vector.fma %447, %82, %444 : vector<4xf32>
%449 = vector.extract_strided_slice %71 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%450 = vector.extract %449[0] : vector<1xf32>
%451 = splat %450 : vector<4xf32>
%452 = vector.fma %451, %83, %448 : vector<4xf32>
%453 = vector.extract_strided_slice %71 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%454 = vector.extract %453[0] : vector<1xf32>
%455 = splat %454 : vector<4xf32>
%456 = vector.fma %455, %84, %452 : vector<4xf32>
%457 = vector.extract_strided_slice %72 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%458 = vector.extract %457[0] : vector<1xf32>
%459 = splat %458 : vector<4xf32>
%460 = vector.fma %459, %85, %456 : vector<4xf32>
%461 = vector.extract_strided_slice %72 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%462 = vector.extract %461[0] : vector<1xf32>
%463 = splat %462 : vector<4xf32>
%464 = vector.fma %463, %86, %460 : vector<4xf32>
%465 = vector.extract_strided_slice %72 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%466 = vector.extract %465[0] : vector<1xf32>
%467 = splat %466 : vector<4xf32>
%468 = vector.fma %467, %87, %464 : vector<4xf32>
%469 = vector.extract_strided_slice %72 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%470 = vector.extract %469[0] : vector<1xf32>
%471 = splat %470 : vector<4xf32>
%472 = vector.fma %471, %88, %468 : vector<4xf32>
%473 = vector.extract_strided_slice %73 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%474 = vector.extract %473[0] : vector<1xf32>
%475 = splat %474 : vector<4xf32>
%476 = vector.fma %475, %81, %48#12 : vector<4xf32>
%477 = vector.extract_strided_slice %73 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%478 = vector.extract %477[0] : vector<1xf32>
%479 = splat %478 : vector<4xf32>
%480 = vector.fma %479, %82, %476 : vector<4xf32>
%481 = vector.extract_strided_slice %73 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%482 = vector.extract %481[0] : vector<1xf32>
%483 = splat %482 : vector<4xf32>
%484 = vector.fma %483, %83, %480 : vector<4xf32>
%485 = vector.extract_strided_slice %73 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%486 = vector.extract %485[0] : vector<1xf32>
%487 = splat %486 : vector<4xf32>
%488 = vector.fma %487, %84, %484 : vector<4xf32>
%489 = vector.extract_strided_slice %74 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%490 = vector.extract %489[0] : vector<1xf32>
%491 = splat %490 : vector<4xf32>
%492 = vector.fma %491, %85, %488 : vector<4xf32>
%493 = vector.extract_strided_slice %74 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%494 = vector.extract %493[0] : vector<1xf32>
%495 = splat %494 : vector<4xf32>
%496 = vector.fma %495, %86, %492 : vector<4xf32>
%497 = vector.extract_strided_slice %74 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%498 = vector.extract %497[0] : vector<1xf32>
%499 = splat %498 : vector<4xf32>
%500 = vector.fma %499, %87, %496 : vector<4xf32>
%501 = vector.extract_strided_slice %74 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%502 = vector.extract %501[0] : vector<1xf32>
%503 = splat %502 : vector<4xf32>
%504 = vector.fma %503, %88, %500 : vector<4xf32>
%505 = vector.extract_strided_slice %75 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%506 = vector.extract %505[0] : vector<1xf32>
%507 = splat %506 : vector<4xf32>
%508 = vector.fma %507, %81, %48#13 : vector<4xf32>
%509 = vector.extract_strided_slice %75 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%510 = vector.extract %509[0] : vector<1xf32>
%511 = splat %510 : vector<4xf32>
%512 = vector.fma %511, %82, %508 : vector<4xf32>
%513 = vector.extract_strided_slice %75 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%514 = vector.extract %513[0] : vector<1xf32>
%515 = splat %514 : vector<4xf32>
%516 = vector.fma %515, %83, %512 : vector<4xf32>
%517 = vector.extract_strided_slice %75 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%518 = vector.extract %517[0] : vector<1xf32>
%519 = splat %518 : vector<4xf32>
%520 = vector.fma %519, %84, %516 : vector<4xf32>
%521 = vector.extract_strided_slice %76 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%522 = vector.extract %521[0] : vector<1xf32>
%523 = splat %522 : vector<4xf32>
%524 = vector.fma %523, %85, %520 : vector<4xf32>
%525 = vector.extract_strided_slice %76 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%526 = vector.extract %525[0] : vector<1xf32>
%527 = splat %526 : vector<4xf32>
%528 = vector.fma %527, %86, %524 : vector<4xf32>
%529 = vector.extract_strided_slice %76 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%530 = vector.extract %529[0] : vector<1xf32>
%531 = splat %530 : vector<4xf32>
%532 = vector.fma %531, %87, %528 : vector<4xf32>
%533 = vector.extract_strided_slice %76 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%534 = vector.extract %533[0] : vector<1xf32>
%535 = splat %534 : vector<4xf32>
%536 = vector.fma %535, %88, %532 : vector<4xf32>
%537 = vector.extract_strided_slice %77 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%538 = vector.extract %537[0] : vector<1xf32>
%539 = splat %538 : vector<4xf32>
%540 = vector.fma %539, %81, %48#14 : vector<4xf32>
%541 = vector.extract_strided_slice %77 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%542 = vector.extract %541[0] : vector<1xf32>
%543 = splat %542 : vector<4xf32>
%544 = vector.fma %543, %82, %540 : vector<4xf32>
%545 = vector.extract_strided_slice %77 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%546 = vector.extract %545[0] : vector<1xf32>
%547 = splat %546 : vector<4xf32>
%548 = vector.fma %547, %83, %544 : vector<4xf32>
%549 = vector.extract_strided_slice %77 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%550 = vector.extract %549[0] : vector<1xf32>
%551 = splat %550 : vector<4xf32>
%552 = vector.fma %551, %84, %548 : vector<4xf32>
%553 = vector.extract_strided_slice %78 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%554 = vector.extract %553[0] : vector<1xf32>
%555 = splat %554 : vector<4xf32>
%556 = vector.fma %555, %85, %552 : vector<4xf32>
%557 = vector.extract_strided_slice %78 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%558 = vector.extract %557[0] : vector<1xf32>
%559 = splat %558 : vector<4xf32>
%560 = vector.fma %559, %86, %556 : vector<4xf32>
%561 = vector.extract_strided_slice %78 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%562 = vector.extract %561[0] : vector<1xf32>
%563 = splat %562 : vector<4xf32>
%564 = vector.fma %563, %87, %560 : vector<4xf32>
%565 = vector.extract_strided_slice %78 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%566 = vector.extract %565[0] : vector<1xf32>
%567 = splat %566 : vector<4xf32>
%568 = vector.fma %567, %88, %564 : vector<4xf32>
%569 = vector.extract_strided_slice %79 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%570 = vector.extract %569[0] : vector<1xf32>
%571 = splat %570 : vector<4xf32>
%572 = vector.fma %571, %81, %48#15 : vector<4xf32>
%573 = vector.extract_strided_slice %79 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%574 = vector.extract %573[0] : vector<1xf32>
%575 = splat %574 : vector<4xf32>
%576 = vector.fma %575, %82, %572 : vector<4xf32>
%577 = vector.extract_strided_slice %79 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%578 = vector.extract %577[0] : vector<1xf32>
%579 = splat %578 : vector<4xf32>
%580 = vector.fma %579, %83, %576 : vector<4xf32>
%581 = vector.extract_strided_slice %79 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%582 = vector.extract %581[0] : vector<1xf32>
%583 = splat %582 : vector<4xf32>
%584 = vector.fma %583, %84, %580 : vector<4xf32>
%585 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%586 = vector.extract %585[0] : vector<1xf32>
%587 = splat %586 : vector<4xf32>
%588 = vector.fma %587, %85, %584 : vector<4xf32>
%589 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%590 = vector.extract %589[0] : vector<1xf32>
%591 = splat %590 : vector<4xf32>
%592 = vector.fma %591, %86, %588 : vector<4xf32>
%593 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%594 = vector.extract %593[0] : vector<1xf32>
%595 = splat %594 : vector<4xf32>
%596 = vector.fma %595, %87, %592 : vector<4xf32>
%597 = vector.extract_strided_slice %80 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%598 = vector.extract %597[0] : vector<1xf32>
%599 = splat %598 : vector<4xf32>
%600 = vector.fma %599, %88, %596 : vector<4xf32>
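      // All 128 K-tiles (1024 reduction steps) are now folded in; store the
      // sixteen vector<4xf32> row results into this thread's 16x4 slice of the
      // output tile (%42). The reverse row order is simply the order in which
      // the unrolled writes were emitted.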
vector.transfer_write %600, %42[%c15, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %568, %42[%c14, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %536, %42[%c13, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %504, %42[%c12, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %472, %42[%c11, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %440, %42[%c10, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %408, %42[%c9, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %376, %42[%c8, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %344, %42[%c7, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %312, %42[%c6, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %280, %42[%c5, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %248, %42[%c4, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %216, %42[%c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %184, %42[%c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %152, %42[%c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %120, %42[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
return
}
// -----// IR Dump After LLVMGPULowerExecutableTarget //----- //
hal.executable.variant public @cuda_nvptx_fb, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
hal.executable.entry_point public @_large_aligned_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = 4 : i32, workloadPerWorkgroup = [128, 64]}, workgroup_size = [32 : index, 4 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
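  // Grid size: one workgroup per 128x64 output tile; for this dispatch
  // (2048x512 result) that works out to (512 ceildiv 128) x (2048 ceildiv 64)
  // x 1 = (4, 32, 1) workgroups of 32x4x1 = 128 threads each.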
builtin.module {
memref.global "private" @__shared_memory___0 : memref<8x128xf32, 3>
memref.global "private" @__shared_memory__ : memref<64x8xf32, 3>
func @_large_aligned_dispatch_0() {
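      // Kernel structure: each workgroup of 32x4x1 = 128 threads produces a
      // 64x128 tile of the 2048x512 result. The K dimension is tiled by 8;
      // every K-step stages a 64x8 LHS slice and an 8x128 RHS slice in shared
      // memory, and each thread accumulates a 16x4 output sub-tile in
      // registers (sixteen vector<4xf32> values). Global loads are software-
      // pipelined one iteration ahead of the compute.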
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c15 = constant 15 : index
%c14 = constant 14 : index
%c13 = constant 13 : index
%c12 = constant 12 : index
%c11 = constant 11 : index
%c10 = constant 10 : index
%c9 = constant 9 : index
%c7 = constant 7 : index
%c6 = constant 6 : index
%c5 = constant 5 : index
%c4 = constant 4 : index
%c3 = constant 3 : index
%c2 = constant 2 : index
%c1 = constant 1 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst_0 = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1016 = constant 1016 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%1]
%13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%0]
%14 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%19 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%20 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%21 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%22 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%23 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%24 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%25 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%26 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%27 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%28 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%29 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%30 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%31 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%32 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 64 + s0 floordiv 2)>()[%0, %1, %2]
%33 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 2) * 8)>()[%0]
%34 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2]
%35 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 32) * 128)>()[%0]
%36 = affine.apply affine_map<()[s0, s1, s2] -> (s1 + s2 * 4 + s0 floordiv 32 + 4)>()[%0, %1, %2]
%37 = memref.subview %30[%12, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%38 = memref.subview %31[0, %13] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
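      // Cooperative staging indices: the 128 threads copy the 64x8 LHS tile
      // with one vector<4xf32> apiece (row %32, column %33) and the 8x128 RHS
      // tile with two apiece (rows %34 and %36 = %34 + 4, column %35).
      // %37 and %38 are this thread's compute views: a 16x8 LHS strip and an
      // 8x4 RHS strip, whose product is its 16x4 accumulator tile.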
scf.for %arg0 = %8 to %c2048 step %9 {
%39 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
scf.for %arg1 = %10 to %c512 step %11 {
%40 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%41 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%42 = memref.subview %41[%12, %13] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%43 = memref.subview %39[0, %c0] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%44 = memref.subview %40[%c0, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%45 = vector.transfer_read %43[%32, %33], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%46 = vector.transfer_read %44[%34, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%47 = vector.transfer_read %44[%36, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
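        // Pipeline prologue: issue the global loads for the first K-tile
        // (%45..%47) before entering the loop; each loop iteration stores the
        // previous iteration's loads to shared memory while fetching the next
        // tile from global memory.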
%48:19 = scf.for %arg2 = %c0 to %c1016 step %c8 iter_args(%arg3 = %14, %arg4 = %15, %arg5 = %16, %arg6 = %17, %arg7 = %18, %arg8 = %19, %arg9 = %20, %arg10 = %21, %arg11 = %22, %arg12 = %23, %arg13 = %24, %arg14 = %25, %arg15 = %26, %arg16 = %27, %arg17 = %28, %arg18 = %29, %arg19 = %45, %arg20 = %46, %arg21 = %47) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
gpu.barrier
vector.transfer_write %arg19, %30[%32, %33] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %arg20, %31[%34, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %arg21, %31[%36, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
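          // The vectors prefetched by the previous iteration are now visible
          // in shared memory to the whole workgroup; read back this thread's
          // 16x8 LHS rows (in two vector<4> halves) and its 8x4 RHS rows.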
%601 = vector.transfer_read %37[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%602 = vector.transfer_read %37[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%603 = vector.transfer_read %37[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%604 = vector.transfer_read %37[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%605 = vector.transfer_read %37[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%606 = vector.transfer_read %37[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%607 = vector.transfer_read %37[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%608 = vector.transfer_read %37[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%609 = vector.transfer_read %37[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%610 = vector.transfer_read %37[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%611 = vector.transfer_read %37[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%612 = vector.transfer_read %37[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%613 = vector.transfer_read %37[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%614 = vector.transfer_read %37[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%615 = vector.transfer_read %37[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%616 = vector.transfer_read %37[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%617 = vector.transfer_read %37[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%618 = vector.transfer_read %37[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%619 = vector.transfer_read %37[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%620 = vector.transfer_read %37[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%621 = vector.transfer_read %37[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%622 = vector.transfer_read %37[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%623 = vector.transfer_read %37[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%624 = vector.transfer_read %37[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%625 = vector.transfer_read %37[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%626 = vector.transfer_read %37[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%627 = vector.transfer_read %37[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%628 = vector.transfer_read %37[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%629 = vector.transfer_read %37[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%630 = vector.transfer_read %37[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%631 = vector.transfer_read %37[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%632 = vector.transfer_read %37[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
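// [annotation, not compiler output] The 32 reads above (two vector<4xf32> per row, columns 0-3 and 4-7)
// pull the full 16x8 LHS tile out of workgroup memory (address space 3) into registers. The 8 reads
// below do the same for this thread's 8x4 RHS tile; the stride-128 affine map suggests %38 is a
// 4-column subview of a shared 8x128 B tile.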
%633 = vector.transfer_read %38[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%634 = vector.transfer_read %38[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%635 = vector.transfer_read %38[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%636 = vector.transfer_read %38[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%637 = vector.transfer_read %38[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%638 = vector.transfer_read %38[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%639 = vector.transfer_read %38[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%640 = vector.transfer_read %38[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
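// [annotation] What follows is the fully unrolled multiply-accumulate: for each LHS row i and each
// k in 0..7, one scalar of A is extracted, splatted across a vector<4xf32>, and fma'd against RHS
// row k. As a scalar sketch (not part of the dump): acc_i[0..3] += A_tile[i][k] * B_tile[k][0..3],
// i.e. eight rank-1 updates per output row.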
%641 = vector.extract_strided_slice %601 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%642 = vector.extract %641[0] : vector<1xf32>
%643 = splat %642 : vector<4xf32>
%644 = vector.fma %643, %633, %arg3 : vector<4xf32>
%645 = vector.extract_strided_slice %601 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%646 = vector.extract %645[0] : vector<1xf32>
%647 = splat %646 : vector<4xf32>
%648 = vector.fma %647, %634, %644 : vector<4xf32>
%649 = vector.extract_strided_slice %601 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%650 = vector.extract %649[0] : vector<1xf32>
%651 = splat %650 : vector<4xf32>
%652 = vector.fma %651, %635, %648 : vector<4xf32>
%653 = vector.extract_strided_slice %601 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%654 = vector.extract %653[0] : vector<1xf32>
%655 = splat %654 : vector<4xf32>
%656 = vector.fma %655, %636, %652 : vector<4xf32>
%657 = vector.extract_strided_slice %602 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%658 = vector.extract %657[0] : vector<1xf32>
%659 = splat %658 : vector<4xf32>
%660 = vector.fma %659, %637, %656 : vector<4xf32>
%661 = vector.extract_strided_slice %602 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%662 = vector.extract %661[0] : vector<1xf32>
%663 = splat %662 : vector<4xf32>
%664 = vector.fma %663, %638, %660 : vector<4xf32>
%665 = vector.extract_strided_slice %602 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%666 = vector.extract %665[0] : vector<1xf32>
%667 = splat %666 : vector<4xf32>
%668 = vector.fma %667, %639, %664 : vector<4xf32>
%669 = vector.extract_strided_slice %602 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%670 = vector.extract %669[0] : vector<1xf32>
%671 = splat %670 : vector<4xf32>
%672 = vector.fma %671, %640, %668 : vector<4xf32>
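// [annotation] %644..%672 above complete the 8-step fma chain for output row 0 (accumulator %arg3).
// The identical extract/splat/fma pattern now repeats for the remaining 15 rows, accumulating into
// %arg4..%arg18, so each thread carries a 16x4 f32 output tile in registers.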
%673 = vector.extract_strided_slice %603 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%674 = vector.extract %673[0] : vector<1xf32>
%675 = splat %674 : vector<4xf32>
%676 = vector.fma %675, %633, %arg4 : vector<4xf32>
%677 = vector.extract_strided_slice %603 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%678 = vector.extract %677[0] : vector<1xf32>
%679 = splat %678 : vector<4xf32>
%680 = vector.fma %679, %634, %676 : vector<4xf32>
%681 = vector.extract_strided_slice %603 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%682 = vector.extract %681[0] : vector<1xf32>
%683 = splat %682 : vector<4xf32>
%684 = vector.fma %683, %635, %680 : vector<4xf32>
%685 = vector.extract_strided_slice %603 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%686 = vector.extract %685[0] : vector<1xf32>
%687 = splat %686 : vector<4xf32>
%688 = vector.fma %687, %636, %684 : vector<4xf32>
%689 = vector.extract_strided_slice %604 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%690 = vector.extract %689[0] : vector<1xf32>
%691 = splat %690 : vector<4xf32>
%692 = vector.fma %691, %637, %688 : vector<4xf32>
%693 = vector.extract_strided_slice %604 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%694 = vector.extract %693[0] : vector<1xf32>
%695 = splat %694 : vector<4xf32>
%696 = vector.fma %695, %638, %692 : vector<4xf32>
%697 = vector.extract_strided_slice %604 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%698 = vector.extract %697[0] : vector<1xf32>
%699 = splat %698 : vector<4xf32>
%700 = vector.fma %699, %639, %696 : vector<4xf32>
%701 = vector.extract_strided_slice %604 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%702 = vector.extract %701[0] : vector<1xf32>
%703 = splat %702 : vector<4xf32>
%704 = vector.fma %703, %640, %700 : vector<4xf32>
%705 = vector.extract_strided_slice %605 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%706 = vector.extract %705[0] : vector<1xf32>
%707 = splat %706 : vector<4xf32>
%708 = vector.fma %707, %633, %arg5 : vector<4xf32>
%709 = vector.extract_strided_slice %605 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%710 = vector.extract %709[0] : vector<1xf32>
%711 = splat %710 : vector<4xf32>
%712 = vector.fma %711, %634, %708 : vector<4xf32>
%713 = vector.extract_strided_slice %605 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%714 = vector.extract %713[0] : vector<1xf32>
%715 = splat %714 : vector<4xf32>
%716 = vector.fma %715, %635, %712 : vector<4xf32>
%717 = vector.extract_strided_slice %605 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%718 = vector.extract %717[0] : vector<1xf32>
%719 = splat %718 : vector<4xf32>
%720 = vector.fma %719, %636, %716 : vector<4xf32>
%721 = vector.extract_strided_slice %606 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%722 = vector.extract %721[0] : vector<1xf32>
%723 = splat %722 : vector<4xf32>
%724 = vector.fma %723, %637, %720 : vector<4xf32>
%725 = vector.extract_strided_slice %606 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%726 = vector.extract %725[0] : vector<1xf32>
%727 = splat %726 : vector<4xf32>
%728 = vector.fma %727, %638, %724 : vector<4xf32>
%729 = vector.extract_strided_slice %606 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%730 = vector.extract %729[0] : vector<1xf32>
%731 = splat %730 : vector<4xf32>
%732 = vector.fma %731, %639, %728 : vector<4xf32>
%733 = vector.extract_strided_slice %606 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%734 = vector.extract %733[0] : vector<1xf32>
%735 = splat %734 : vector<4xf32>
%736 = vector.fma %735, %640, %732 : vector<4xf32>
%737 = vector.extract_strided_slice %607 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%738 = vector.extract %737[0] : vector<1xf32>
%739 = splat %738 : vector<4xf32>
%740 = vector.fma %739, %633, %arg6 : vector<4xf32>
%741 = vector.extract_strided_slice %607 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%742 = vector.extract %741[0] : vector<1xf32>
%743 = splat %742 : vector<4xf32>
%744 = vector.fma %743, %634, %740 : vector<4xf32>
%745 = vector.extract_strided_slice %607 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%746 = vector.extract %745[0] : vector<1xf32>
%747 = splat %746 : vector<4xf32>
%748 = vector.fma %747, %635, %744 : vector<4xf32>
%749 = vector.extract_strided_slice %607 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%750 = vector.extract %749[0] : vector<1xf32>
%751 = splat %750 : vector<4xf32>
%752 = vector.fma %751, %636, %748 : vector<4xf32>
%753 = vector.extract_strided_slice %608 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%754 = vector.extract %753[0] : vector<1xf32>
%755 = splat %754 : vector<4xf32>
%756 = vector.fma %755, %637, %752 : vector<4xf32>
%757 = vector.extract_strided_slice %608 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%758 = vector.extract %757[0] : vector<1xf32>
%759 = splat %758 : vector<4xf32>
%760 = vector.fma %759, %638, %756 : vector<4xf32>
%761 = vector.extract_strided_slice %608 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%762 = vector.extract %761[0] : vector<1xf32>
%763 = splat %762 : vector<4xf32>
%764 = vector.fma %763, %639, %760 : vector<4xf32>
%765 = vector.extract_strided_slice %608 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%766 = vector.extract %765[0] : vector<1xf32>
%767 = splat %766 : vector<4xf32>
%768 = vector.fma %767, %640, %764 : vector<4xf32>
%769 = vector.extract_strided_slice %609 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%770 = vector.extract %769[0] : vector<1xf32>
%771 = splat %770 : vector<4xf32>
%772 = vector.fma %771, %633, %arg7 : vector<4xf32>
%773 = vector.extract_strided_slice %609 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%774 = vector.extract %773[0] : vector<1xf32>
%775 = splat %774 : vector<4xf32>
%776 = vector.fma %775, %634, %772 : vector<4xf32>
%777 = vector.extract_strided_slice %609 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%778 = vector.extract %777[0] : vector<1xf32>
%779 = splat %778 : vector<4xf32>
%780 = vector.fma %779, %635, %776 : vector<4xf32>
%781 = vector.extract_strided_slice %609 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%782 = vector.extract %781[0] : vector<1xf32>
%783 = splat %782 : vector<4xf32>
%784 = vector.fma %783, %636, %780 : vector<4xf32>
%785 = vector.extract_strided_slice %610 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%786 = vector.extract %785[0] : vector<1xf32>
%787 = splat %786 : vector<4xf32>
%788 = vector.fma %787, %637, %784 : vector<4xf32>
%789 = vector.extract_strided_slice %610 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%790 = vector.extract %789[0] : vector<1xf32>
%791 = splat %790 : vector<4xf32>
%792 = vector.fma %791, %638, %788 : vector<4xf32>
%793 = vector.extract_strided_slice %610 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%794 = vector.extract %793[0] : vector<1xf32>
%795 = splat %794 : vector<4xf32>
%796 = vector.fma %795, %639, %792 : vector<4xf32>
%797 = vector.extract_strided_slice %610 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%798 = vector.extract %797[0] : vector<1xf32>
%799 = splat %798 : vector<4xf32>
%800 = vector.fma %799, %640, %796 : vector<4xf32>
%801 = vector.extract_strided_slice %611 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%802 = vector.extract %801[0] : vector<1xf32>
%803 = splat %802 : vector<4xf32>
%804 = vector.fma %803, %633, %arg8 : vector<4xf32>
%805 = vector.extract_strided_slice %611 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%806 = vector.extract %805[0] : vector<1xf32>
%807 = splat %806 : vector<4xf32>
%808 = vector.fma %807, %634, %804 : vector<4xf32>
%809 = vector.extract_strided_slice %611 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%810 = vector.extract %809[0] : vector<1xf32>
%811 = splat %810 : vector<4xf32>
%812 = vector.fma %811, %635, %808 : vector<4xf32>
%813 = vector.extract_strided_slice %611 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%814 = vector.extract %813[0] : vector<1xf32>
%815 = splat %814 : vector<4xf32>
%816 = vector.fma %815, %636, %812 : vector<4xf32>
%817 = vector.extract_strided_slice %612 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%818 = vector.extract %817[0] : vector<1xf32>
%819 = splat %818 : vector<4xf32>
%820 = vector.fma %819, %637, %816 : vector<4xf32>
%821 = vector.extract_strided_slice %612 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%822 = vector.extract %821[0] : vector<1xf32>
%823 = splat %822 : vector<4xf32>
%824 = vector.fma %823, %638, %820 : vector<4xf32>
%825 = vector.extract_strided_slice %612 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%826 = vector.extract %825[0] : vector<1xf32>
%827 = splat %826 : vector<4xf32>
%828 = vector.fma %827, %639, %824 : vector<4xf32>
%829 = vector.extract_strided_slice %612 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%830 = vector.extract %829[0] : vector<1xf32>
%831 = splat %830 : vector<4xf32>
%832 = vector.fma %831, %640, %828 : vector<4xf32>
%833 = vector.extract_strided_slice %613 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%834 = vector.extract %833[0] : vector<1xf32>
%835 = splat %834 : vector<4xf32>
%836 = vector.fma %835, %633, %arg9 : vector<4xf32>
%837 = vector.extract_strided_slice %613 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%838 = vector.extract %837[0] : vector<1xf32>
%839 = splat %838 : vector<4xf32>
%840 = vector.fma %839, %634, %836 : vector<4xf32>
%841 = vector.extract_strided_slice %613 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%842 = vector.extract %841[0] : vector<1xf32>
%843 = splat %842 : vector<4xf32>
%844 = vector.fma %843, %635, %840 : vector<4xf32>
%845 = vector.extract_strided_slice %613 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%846 = vector.extract %845[0] : vector<1xf32>
%847 = splat %846 : vector<4xf32>
%848 = vector.fma %847, %636, %844 : vector<4xf32>
%849 = vector.extract_strided_slice %614 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%850 = vector.extract %849[0] : vector<1xf32>
%851 = splat %850 : vector<4xf32>
%852 = vector.fma %851, %637, %848 : vector<4xf32>
%853 = vector.extract_strided_slice %614 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%854 = vector.extract %853[0] : vector<1xf32>
%855 = splat %854 : vector<4xf32>
%856 = vector.fma %855, %638, %852 : vector<4xf32>
%857 = vector.extract_strided_slice %614 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%858 = vector.extract %857[0] : vector<1xf32>
%859 = splat %858 : vector<4xf32>
%860 = vector.fma %859, %639, %856 : vector<4xf32>
%861 = vector.extract_strided_slice %614 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%862 = vector.extract %861[0] : vector<1xf32>
%863 = splat %862 : vector<4xf32>
%864 = vector.fma %863, %640, %860 : vector<4xf32>
%865 = vector.extract_strided_slice %615 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%866 = vector.extract %865[0] : vector<1xf32>
%867 = splat %866 : vector<4xf32>
%868 = vector.fma %867, %633, %arg10 : vector<4xf32>
%869 = vector.extract_strided_slice %615 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%870 = vector.extract %869[0] : vector<1xf32>
%871 = splat %870 : vector<4xf32>
%872 = vector.fma %871, %634, %868 : vector<4xf32>
%873 = vector.extract_strided_slice %615 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%874 = vector.extract %873[0] : vector<1xf32>
%875 = splat %874 : vector<4xf32>
%876 = vector.fma %875, %635, %872 : vector<4xf32>
%877 = vector.extract_strided_slice %615 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%878 = vector.extract %877[0] : vector<1xf32>
%879 = splat %878 : vector<4xf32>
%880 = vector.fma %879, %636, %876 : vector<4xf32>
%881 = vector.extract_strided_slice %616 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%882 = vector.extract %881[0] : vector<1xf32>
%883 = splat %882 : vector<4xf32>
%884 = vector.fma %883, %637, %880 : vector<4xf32>
%885 = vector.extract_strided_slice %616 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%886 = vector.extract %885[0] : vector<1xf32>
%887 = splat %886 : vector<4xf32>
%888 = vector.fma %887, %638, %884 : vector<4xf32>
%889 = vector.extract_strided_slice %616 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%890 = vector.extract %889[0] : vector<1xf32>
%891 = splat %890 : vector<4xf32>
%892 = vector.fma %891, %639, %888 : vector<4xf32>
%893 = vector.extract_strided_slice %616 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%894 = vector.extract %893[0] : vector<1xf32>
%895 = splat %894 : vector<4xf32>
%896 = vector.fma %895, %640, %892 : vector<4xf32>
%897 = vector.extract_strided_slice %617 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%898 = vector.extract %897[0] : vector<1xf32>
%899 = splat %898 : vector<4xf32>
%900 = vector.fma %899, %633, %arg11 : vector<4xf32>
%901 = vector.extract_strided_slice %617 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%902 = vector.extract %901[0] : vector<1xf32>
%903 = splat %902 : vector<4xf32>
%904 = vector.fma %903, %634, %900 : vector<4xf32>
%905 = vector.extract_strided_slice %617 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%906 = vector.extract %905[0] : vector<1xf32>
%907 = splat %906 : vector<4xf32>
%908 = vector.fma %907, %635, %904 : vector<4xf32>
%909 = vector.extract_strided_slice %617 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%910 = vector.extract %909[0] : vector<1xf32>
%911 = splat %910 : vector<4xf32>
%912 = vector.fma %911, %636, %908 : vector<4xf32>
%913 = vector.extract_strided_slice %618 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%914 = vector.extract %913[0] : vector<1xf32>
%915 = splat %914 : vector<4xf32>
%916 = vector.fma %915, %637, %912 : vector<4xf32>
%917 = vector.extract_strided_slice %618 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%918 = vector.extract %917[0] : vector<1xf32>
%919 = splat %918 : vector<4xf32>
%920 = vector.fma %919, %638, %916 : vector<4xf32>
%921 = vector.extract_strided_slice %618 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%922 = vector.extract %921[0] : vector<1xf32>
%923 = splat %922 : vector<4xf32>
%924 = vector.fma %923, %639, %920 : vector<4xf32>
%925 = vector.extract_strided_slice %618 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%926 = vector.extract %925[0] : vector<1xf32>
%927 = splat %926 : vector<4xf32>
%928 = vector.fma %927, %640, %924 : vector<4xf32>
%929 = vector.extract_strided_slice %619 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%930 = vector.extract %929[0] : vector<1xf32>
%931 = splat %930 : vector<4xf32>
%932 = vector.fma %931, %633, %arg12 : vector<4xf32>
%933 = vector.extract_strided_slice %619 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%934 = vector.extract %933[0] : vector<1xf32>
%935 = splat %934 : vector<4xf32>
%936 = vector.fma %935, %634, %932 : vector<4xf32>
%937 = vector.extract_strided_slice %619 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%938 = vector.extract %937[0] : vector<1xf32>
%939 = splat %938 : vector<4xf32>
%940 = vector.fma %939, %635, %936 : vector<4xf32>
%941 = vector.extract_strided_slice %619 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%942 = vector.extract %941[0] : vector<1xf32>
%943 = splat %942 : vector<4xf32>
%944 = vector.fma %943, %636, %940 : vector<4xf32>
%945 = vector.extract_strided_slice %620 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%946 = vector.extract %945[0] : vector<1xf32>
%947 = splat %946 : vector<4xf32>
%948 = vector.fma %947, %637, %944 : vector<4xf32>
%949 = vector.extract_strided_slice %620 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%950 = vector.extract %949[0] : vector<1xf32>
%951 = splat %950 : vector<4xf32>
%952 = vector.fma %951, %638, %948 : vector<4xf32>
%953 = vector.extract_strided_slice %620 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%954 = vector.extract %953[0] : vector<1xf32>
%955 = splat %954 : vector<4xf32>
%956 = vector.fma %955, %639, %952 : vector<4xf32>
%957 = vector.extract_strided_slice %620 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%958 = vector.extract %957[0] : vector<1xf32>
%959 = splat %958 : vector<4xf32>
%960 = vector.fma %959, %640, %956 : vector<4xf32>
%961 = vector.extract_strided_slice %621 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%962 = vector.extract %961[0] : vector<1xf32>
%963 = splat %962 : vector<4xf32>
%964 = vector.fma %963, %633, %arg13 : vector<4xf32>
%965 = vector.extract_strided_slice %621 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%966 = vector.extract %965[0] : vector<1xf32>
%967 = splat %966 : vector<4xf32>
%968 = vector.fma %967, %634, %964 : vector<4xf32>
%969 = vector.extract_strided_slice %621 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%970 = vector.extract %969[0] : vector<1xf32>
%971 = splat %970 : vector<4xf32>
%972 = vector.fma %971, %635, %968 : vector<4xf32>
%973 = vector.extract_strided_slice %621 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%974 = vector.extract %973[0] : vector<1xf32>
%975 = splat %974 : vector<4xf32>
%976 = vector.fma %975, %636, %972 : vector<4xf32>
%977 = vector.extract_strided_slice %622 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%978 = vector.extract %977[0] : vector<1xf32>
%979 = splat %978 : vector<4xf32>
%980 = vector.fma %979, %637, %976 : vector<4xf32>
%981 = vector.extract_strided_slice %622 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%982 = vector.extract %981[0] : vector<1xf32>
%983 = splat %982 : vector<4xf32>
%984 = vector.fma %983, %638, %980 : vector<4xf32>
%985 = vector.extract_strided_slice %622 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%986 = vector.extract %985[0] : vector<1xf32>
%987 = splat %986 : vector<4xf32>
%988 = vector.fma %987, %639, %984 : vector<4xf32>
%989 = vector.extract_strided_slice %622 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%990 = vector.extract %989[0] : vector<1xf32>
%991 = splat %990 : vector<4xf32>
%992 = vector.fma %991, %640, %988 : vector<4xf32>
%993 = vector.extract_strided_slice %623 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%994 = vector.extract %993[0] : vector<1xf32>
%995 = splat %994 : vector<4xf32>
%996 = vector.fma %995, %633, %arg14 : vector<4xf32>
%997 = vector.extract_strided_slice %623 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%998 = vector.extract %997[0] : vector<1xf32>
%999 = splat %998 : vector<4xf32>
%1000 = vector.fma %999, %634, %996 : vector<4xf32>
%1001 = vector.extract_strided_slice %623 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1002 = vector.extract %1001[0] : vector<1xf32>
%1003 = splat %1002 : vector<4xf32>
%1004 = vector.fma %1003, %635, %1000 : vector<4xf32>
%1005 = vector.extract_strided_slice %623 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1006 = vector.extract %1005[0] : vector<1xf32>
%1007 = splat %1006 : vector<4xf32>
%1008 = vector.fma %1007, %636, %1004 : vector<4xf32>
%1009 = vector.extract_strided_slice %624 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1010 = vector.extract %1009[0] : vector<1xf32>
%1011 = splat %1010 : vector<4xf32>
%1012 = vector.fma %1011, %637, %1008 : vector<4xf32>
%1013 = vector.extract_strided_slice %624 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1014 = vector.extract %1013[0] : vector<1xf32>
%1015 = splat %1014 : vector<4xf32>
%1016 = vector.fma %1015, %638, %1012 : vector<4xf32>
%1017 = vector.extract_strided_slice %624 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1018 = vector.extract %1017[0] : vector<1xf32>
%1019 = splat %1018 : vector<4xf32>
%1020 = vector.fma %1019, %639, %1016 : vector<4xf32>
%1021 = vector.extract_strided_slice %624 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1022 = vector.extract %1021[0] : vector<1xf32>
%1023 = splat %1022 : vector<4xf32>
%1024 = vector.fma %1023, %640, %1020 : vector<4xf32>
%1025 = vector.extract_strided_slice %625 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1026 = vector.extract %1025[0] : vector<1xf32>
%1027 = splat %1026 : vector<4xf32>
%1028 = vector.fma %1027, %633, %arg15 : vector<4xf32>
%1029 = vector.extract_strided_slice %625 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1030 = vector.extract %1029[0] : vector<1xf32>
%1031 = splat %1030 : vector<4xf32>
%1032 = vector.fma %1031, %634, %1028 : vector<4xf32>
%1033 = vector.extract_strided_slice %625 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1034 = vector.extract %1033[0] : vector<1xf32>
%1035 = splat %1034 : vector<4xf32>
%1036 = vector.fma %1035, %635, %1032 : vector<4xf32>
%1037 = vector.extract_strided_slice %625 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1038 = vector.extract %1037[0] : vector<1xf32>
%1039 = splat %1038 : vector<4xf32>
%1040 = vector.fma %1039, %636, %1036 : vector<4xf32>
%1041 = vector.extract_strided_slice %626 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1042 = vector.extract %1041[0] : vector<1xf32>
%1043 = splat %1042 : vector<4xf32>
%1044 = vector.fma %1043, %637, %1040 : vector<4xf32>
%1045 = vector.extract_strided_slice %626 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1046 = vector.extract %1045[0] : vector<1xf32>
%1047 = splat %1046 : vector<4xf32>
%1048 = vector.fma %1047, %638, %1044 : vector<4xf32>
%1049 = vector.extract_strided_slice %626 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1050 = vector.extract %1049[0] : vector<1xf32>
%1051 = splat %1050 : vector<4xf32>
%1052 = vector.fma %1051, %639, %1048 : vector<4xf32>
%1053 = vector.extract_strided_slice %626 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1054 = vector.extract %1053[0] : vector<1xf32>
%1055 = splat %1054 : vector<4xf32>
%1056 = vector.fma %1055, %640, %1052 : vector<4xf32>
%1057 = vector.extract_strided_slice %627 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1058 = vector.extract %1057[0] : vector<1xf32>
%1059 = splat %1058 : vector<4xf32>
%1060 = vector.fma %1059, %633, %arg16 : vector<4xf32>
%1061 = vector.extract_strided_slice %627 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1062 = vector.extract %1061[0] : vector<1xf32>
%1063 = splat %1062 : vector<4xf32>
%1064 = vector.fma %1063, %634, %1060 : vector<4xf32>
%1065 = vector.extract_strided_slice %627 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1066 = vector.extract %1065[0] : vector<1xf32>
%1067 = splat %1066 : vector<4xf32>
%1068 = vector.fma %1067, %635, %1064 : vector<4xf32>
%1069 = vector.extract_strided_slice %627 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1070 = vector.extract %1069[0] : vector<1xf32>
%1071 = splat %1070 : vector<4xf32>
%1072 = vector.fma %1071, %636, %1068 : vector<4xf32>
%1073 = vector.extract_strided_slice %628 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1074 = vector.extract %1073[0] : vector<1xf32>
%1075 = splat %1074 : vector<4xf32>
%1076 = vector.fma %1075, %637, %1072 : vector<4xf32>
%1077 = vector.extract_strided_slice %628 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1078 = vector.extract %1077[0] : vector<1xf32>
%1079 = splat %1078 : vector<4xf32>
%1080 = vector.fma %1079, %638, %1076 : vector<4xf32>
%1081 = vector.extract_strided_slice %628 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1082 = vector.extract %1081[0] : vector<1xf32>
%1083 = splat %1082 : vector<4xf32>
%1084 = vector.fma %1083, %639, %1080 : vector<4xf32>
%1085 = vector.extract_strided_slice %628 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1086 = vector.extract %1085[0] : vector<1xf32>
%1087 = splat %1086 : vector<4xf32>
%1088 = vector.fma %1087, %640, %1084 : vector<4xf32>
%1089 = vector.extract_strided_slice %629 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1090 = vector.extract %1089[0] : vector<1xf32>
%1091 = splat %1090 : vector<4xf32>
%1092 = vector.fma %1091, %633, %arg17 : vector<4xf32>
%1093 = vector.extract_strided_slice %629 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1094 = vector.extract %1093[0] : vector<1xf32>
%1095 = splat %1094 : vector<4xf32>
%1096 = vector.fma %1095, %634, %1092 : vector<4xf32>
%1097 = vector.extract_strided_slice %629 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1098 = vector.extract %1097[0] : vector<1xf32>
%1099 = splat %1098 : vector<4xf32>
%1100 = vector.fma %1099, %635, %1096 : vector<4xf32>
%1101 = vector.extract_strided_slice %629 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1102 = vector.extract %1101[0] : vector<1xf32>
%1103 = splat %1102 : vector<4xf32>
%1104 = vector.fma %1103, %636, %1100 : vector<4xf32>
%1105 = vector.extract_strided_slice %630 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1106 = vector.extract %1105[0] : vector<1xf32>
%1107 = splat %1106 : vector<4xf32>
%1108 = vector.fma %1107, %637, %1104 : vector<4xf32>
%1109 = vector.extract_strided_slice %630 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1110 = vector.extract %1109[0] : vector<1xf32>
%1111 = splat %1110 : vector<4xf32>
%1112 = vector.fma %1111, %638, %1108 : vector<4xf32>
%1113 = vector.extract_strided_slice %630 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1114 = vector.extract %1113[0] : vector<1xf32>
%1115 = splat %1114 : vector<4xf32>
%1116 = vector.fma %1115, %639, %1112 : vector<4xf32>
%1117 = vector.extract_strided_slice %630 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1118 = vector.extract %1117[0] : vector<1xf32>
%1119 = splat %1118 : vector<4xf32>
%1120 = vector.fma %1119, %640, %1116 : vector<4xf32>
%1121 = vector.extract_strided_slice %631 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1122 = vector.extract %1121[0] : vector<1xf32>
%1123 = splat %1122 : vector<4xf32>
%1124 = vector.fma %1123, %633, %arg18 : vector<4xf32>
%1125 = vector.extract_strided_slice %631 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1126 = vector.extract %1125[0] : vector<1xf32>
%1127 = splat %1126 : vector<4xf32>
%1128 = vector.fma %1127, %634, %1124 : vector<4xf32>
%1129 = vector.extract_strided_slice %631 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1130 = vector.extract %1129[0] : vector<1xf32>
%1131 = splat %1130 : vector<4xf32>
%1132 = vector.fma %1131, %635, %1128 : vector<4xf32>
%1133 = vector.extract_strided_slice %631 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1134 = vector.extract %1133[0] : vector<1xf32>
%1135 = splat %1134 : vector<4xf32>
%1136 = vector.fma %1135, %636, %1132 : vector<4xf32>
%1137 = vector.extract_strided_slice %632 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1138 = vector.extract %1137[0] : vector<1xf32>
%1139 = splat %1138 : vector<4xf32>
%1140 = vector.fma %1139, %637, %1136 : vector<4xf32>
%1141 = vector.extract_strided_slice %632 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1142 = vector.extract %1141[0] : vector<1xf32>
%1143 = splat %1142 : vector<4xf32>
%1144 = vector.fma %1143, %638, %1140 : vector<4xf32>
%1145 = vector.extract_strided_slice %632 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1146 = vector.extract %1145[0] : vector<1xf32>
%1147 = splat %1146 : vector<4xf32>
%1148 = vector.fma %1147, %639, %1144 : vector<4xf32>
%1149 = vector.extract_strided_slice %632 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1150 = vector.extract %1149[0] : vector<1xf32>
%1151 = splat %1150 : vector<4xf32>
%1152 = vector.fma %1151, %640, %1148 : vector<4xf32>
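// [annotation] All 16 row accumulators are now updated for this k-tile. The ops below advance the
// k offset by 8 and issue the next iteration's global loads; the __pipelining_global_load__
// attribute marks them as the prefetch stage of the software-pipelined loop, presumably so these
// loads overlap with the current iteration's compute.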
%1153 = addi %arg2, %c8 : index
%1154 = memref.subview %39[0, %1153] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%1155 = addi %arg2, %c8 : index
%1156 = memref.subview %40[%1155, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%1157 = vector.transfer_read %1154[%32, %33], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%1158 = vector.transfer_read %1156[%34, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%1159 = vector.transfer_read %1156[%36, %35], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
scf.yield %672, %704, %736, %768, %800, %832, %864, %896, %928, %960, %992, %1024, %1056, %1088, %1120, %1152, %1157, %1158, %1159 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
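// [annotation] Loop epilogue: the last prefetched vectors (%48#16..%48#18) still live in registers,
// so they are staged into workgroup memory between barriers before the final k-tile is consumed below.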
gpu.barrier
vector.transfer_write %48#16, %30[%32, %33] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %48#17, %31[%34, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %48#18, %31[%36, %35] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
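// [annotation] Final unrolled iteration: re-read the freshly staged 16x8 LHS and 8x4 RHS tiles from
// workgroup memory, then repeat the fma chains once more, this time seeded with the loop-carried
// accumulators %48#0..%48#15 instead of the block arguments.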
%49 = vector.transfer_read %37[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%50 = vector.transfer_read %37[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%51 = vector.transfer_read %37[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%52 = vector.transfer_read %37[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%53 = vector.transfer_read %37[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%54 = vector.transfer_read %37[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%55 = vector.transfer_read %37[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%56 = vector.transfer_read %37[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%57 = vector.transfer_read %37[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%58 = vector.transfer_read %37[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%59 = vector.transfer_read %37[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%60 = vector.transfer_read %37[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%61 = vector.transfer_read %37[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%62 = vector.transfer_read %37[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%63 = vector.transfer_read %37[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%64 = vector.transfer_read %37[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%65 = vector.transfer_read %37[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%66 = vector.transfer_read %37[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%67 = vector.transfer_read %37[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%68 = vector.transfer_read %37[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%69 = vector.transfer_read %37[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%70 = vector.transfer_read %37[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%71 = vector.transfer_read %37[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%72 = vector.transfer_read %37[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%73 = vector.transfer_read %37[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%74 = vector.transfer_read %37[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%75 = vector.transfer_read %37[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%76 = vector.transfer_read %37[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%77 = vector.transfer_read %37[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%78 = vector.transfer_read %37[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%79 = vector.transfer_read %37[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%80 = vector.transfer_read %37[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%81 = vector.transfer_read %38[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%82 = vector.transfer_read %38[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%83 = vector.transfer_read %38[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%84 = vector.transfer_read %38[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%85 = vector.transfer_read %38[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%86 = vector.transfer_read %38[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%87 = vector.transfer_read %38[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%88 = vector.transfer_read %38[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
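// [annotation] Same outer-product pattern as inside the loop body; e.g. the chain starting at %92
// folds RHS rows %81..%88 into accumulator %48#0.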
%89 = vector.extract_strided_slice %49 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%90 = vector.extract %89[0] : vector<1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.fma %91, %81, %48#0 : vector<4xf32>
%93 = vector.extract_strided_slice %49 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%94 = vector.extract %93[0] : vector<1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.fma %95, %82, %92 : vector<4xf32>
%97 = vector.extract_strided_slice %49 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%98 = vector.extract %97[0] : vector<1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.fma %99, %83, %96 : vector<4xf32>
%101 = vector.extract_strided_slice %49 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%102 = vector.extract %101[0] : vector<1xf32>
%103 = splat %102 : vector<4xf32>
%104 = vector.fma %103, %84, %100 : vector<4xf32>
%105 = vector.extract_strided_slice %50 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%106 = vector.extract %105[0] : vector<1xf32>
%107 = splat %106 : vector<4xf32>
%108 = vector.fma %107, %85, %104 : vector<4xf32>
%109 = vector.extract_strided_slice %50 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%110 = vector.extract %109[0] : vector<1xf32>
%111 = splat %110 : vector<4xf32>
%112 = vector.fma %111, %86, %108 : vector<4xf32>
%113 = vector.extract_strided_slice %50 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%114 = vector.extract %113[0] : vector<1xf32>
%115 = splat %114 : vector<4xf32>
%116 = vector.fma %115, %87, %112 : vector<4xf32>
%117 = vector.extract_strided_slice %50 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %88, %116 : vector<4xf32>
%121 = vector.extract_strided_slice %51 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%122 = vector.extract %121[0] : vector<1xf32>
%123 = splat %122 : vector<4xf32>
%124 = vector.fma %123, %81, %48#1 : vector<4xf32>
%125 = vector.extract_strided_slice %51 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%126 = vector.extract %125[0] : vector<1xf32>
%127 = splat %126 : vector<4xf32>
%128 = vector.fma %127, %82, %124 : vector<4xf32>
%129 = vector.extract_strided_slice %51 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%130 = vector.extract %129[0] : vector<1xf32>
%131 = splat %130 : vector<4xf32>
%132 = vector.fma %131, %83, %128 : vector<4xf32>
%133 = vector.extract_strided_slice %51 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%134 = vector.extract %133[0] : vector<1xf32>
%135 = splat %134 : vector<4xf32>
%136 = vector.fma %135, %84, %132 : vector<4xf32>
%137 = vector.extract_strided_slice %52 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%138 = vector.extract %137[0] : vector<1xf32>
%139 = splat %138 : vector<4xf32>
%140 = vector.fma %139, %85, %136 : vector<4xf32>
%141 = vector.extract_strided_slice %52 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%142 = vector.extract %141[0] : vector<1xf32>
%143 = splat %142 : vector<4xf32>
%144 = vector.fma %143, %86, %140 : vector<4xf32>
%145 = vector.extract_strided_slice %52 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%146 = vector.extract %145[0] : vector<1xf32>
%147 = splat %146 : vector<4xf32>
%148 = vector.fma %147, %87, %144 : vector<4xf32>
%149 = vector.extract_strided_slice %52 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%150 = vector.extract %149[0] : vector<1xf32>
%151 = splat %150 : vector<4xf32>
%152 = vector.fma %151, %88, %148 : vector<4xf32>
%153 = vector.extract_strided_slice %53 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%154 = vector.extract %153[0] : vector<1xf32>
%155 = splat %154 : vector<4xf32>
%156 = vector.fma %155, %81, %48#2 : vector<4xf32>
%157 = vector.extract_strided_slice %53 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%158 = vector.extract %157[0] : vector<1xf32>
%159 = splat %158 : vector<4xf32>
%160 = vector.fma %159, %82, %156 : vector<4xf32>
%161 = vector.extract_strided_slice %53 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%162 = vector.extract %161[0] : vector<1xf32>
%163 = splat %162 : vector<4xf32>
%164 = vector.fma %163, %83, %160 : vector<4xf32>
%165 = vector.extract_strided_slice %53 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%166 = vector.extract %165[0] : vector<1xf32>
%167 = splat %166 : vector<4xf32>
%168 = vector.fma %167, %84, %164 : vector<4xf32>
%169 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%170 = vector.extract %169[0] : vector<1xf32>
%171 = splat %170 : vector<4xf32>
%172 = vector.fma %171, %85, %168 : vector<4xf32>
%173 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%174 = vector.extract %173[0] : vector<1xf32>
%175 = splat %174 : vector<4xf32>
%176 = vector.fma %175, %86, %172 : vector<4xf32>
%177 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%178 = vector.extract %177[0] : vector<1xf32>
%179 = splat %178 : vector<4xf32>
%180 = vector.fma %179, %87, %176 : vector<4xf32>
%181 = vector.extract_strided_slice %54 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%182 = vector.extract %181[0] : vector<1xf32>
%183 = splat %182 : vector<4xf32>
%184 = vector.fma %183, %88, %180 : vector<4xf32>
%185 = vector.extract_strided_slice %55 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%186 = vector.extract %185[0] : vector<1xf32>
%187 = splat %186 : vector<4xf32>
%188 = vector.fma %187, %81, %48#3 : vector<4xf32>
%189 = vector.extract_strided_slice %55 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%190 = vector.extract %189[0] : vector<1xf32>
%191 = splat %190 : vector<4xf32>
%192 = vector.fma %191, %82, %188 : vector<4xf32>
%193 = vector.extract_strided_slice %55 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%194 = vector.extract %193[0] : vector<1xf32>
%195 = splat %194 : vector<4xf32>
%196 = vector.fma %195, %83, %192 : vector<4xf32>
%197 = vector.extract_strided_slice %55 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%198 = vector.extract %197[0] : vector<1xf32>
%199 = splat %198 : vector<4xf32>
%200 = vector.fma %199, %84, %196 : vector<4xf32>
%201 = vector.extract_strided_slice %56 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%202 = vector.extract %201[0] : vector<1xf32>
%203 = splat %202 : vector<4xf32>
%204 = vector.fma %203, %85, %200 : vector<4xf32>
%205 = vector.extract_strided_slice %56 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%206 = vector.extract %205[0] : vector<1xf32>
%207 = splat %206 : vector<4xf32>
%208 = vector.fma %207, %86, %204 : vector<4xf32>
%209 = vector.extract_strided_slice %56 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%210 = vector.extract %209[0] : vector<1xf32>
%211 = splat %210 : vector<4xf32>
%212 = vector.fma %211, %87, %208 : vector<4xf32>
%213 = vector.extract_strided_slice %56 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%214 = vector.extract %213[0] : vector<1xf32>
%215 = splat %214 : vector<4xf32>
%216 = vector.fma %215, %88, %212 : vector<4xf32>
%217 = vector.extract_strided_slice %57 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%218 = vector.extract %217[0] : vector<1xf32>
%219 = splat %218 : vector<4xf32>
%220 = vector.fma %219, %81, %48#4 : vector<4xf32>
%221 = vector.extract_strided_slice %57 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%222 = vector.extract %221[0] : vector<1xf32>
%223 = splat %222 : vector<4xf32>
%224 = vector.fma %223, %82, %220 : vector<4xf32>
%225 = vector.extract_strided_slice %57 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%226 = vector.extract %225[0] : vector<1xf32>
%227 = splat %226 : vector<4xf32>
%228 = vector.fma %227, %83, %224 : vector<4xf32>
%229 = vector.extract_strided_slice %57 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%230 = vector.extract %229[0] : vector<1xf32>
%231 = splat %230 : vector<4xf32>
%232 = vector.fma %231, %84, %228 : vector<4xf32>
%233 = vector.extract_strided_slice %58 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%234 = vector.extract %233[0] : vector<1xf32>
%235 = splat %234 : vector<4xf32>
%236 = vector.fma %235, %85, %232 : vector<4xf32>
%237 = vector.extract_strided_slice %58 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%238 = vector.extract %237[0] : vector<1xf32>
%239 = splat %238 : vector<4xf32>
%240 = vector.fma %239, %86, %236 : vector<4xf32>
%241 = vector.extract_strided_slice %58 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%242 = vector.extract %241[0] : vector<1xf32>
%243 = splat %242 : vector<4xf32>
%244 = vector.fma %243, %87, %240 : vector<4xf32>
%245 = vector.extract_strided_slice %58 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%246 = vector.extract %245[0] : vector<1xf32>
%247 = splat %246 : vector<4xf32>
%248 = vector.fma %247, %88, %244 : vector<4xf32>
%249 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%250 = vector.extract %249[0] : vector<1xf32>
%251 = splat %250 : vector<4xf32>
%252 = vector.fma %251, %81, %48#5 : vector<4xf32>
%253 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%254 = vector.extract %253[0] : vector<1xf32>
%255 = splat %254 : vector<4xf32>
%256 = vector.fma %255, %82, %252 : vector<4xf32>
%257 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%258 = vector.extract %257[0] : vector<1xf32>
%259 = splat %258 : vector<4xf32>
%260 = vector.fma %259, %83, %256 : vector<4xf32>
%261 = vector.extract_strided_slice %59 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%262 = vector.extract %261[0] : vector<1xf32>
%263 = splat %262 : vector<4xf32>
%264 = vector.fma %263, %84, %260 : vector<4xf32>
%265 = vector.extract_strided_slice %60 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%266 = vector.extract %265[0] : vector<1xf32>
%267 = splat %266 : vector<4xf32>
%268 = vector.fma %267, %85, %264 : vector<4xf32>
%269 = vector.extract_strided_slice %60 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%270 = vector.extract %269[0] : vector<1xf32>
%271 = splat %270 : vector<4xf32>
%272 = vector.fma %271, %86, %268 : vector<4xf32>
%273 = vector.extract_strided_slice %60 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%274 = vector.extract %273[0] : vector<1xf32>
%275 = splat %274 : vector<4xf32>
%276 = vector.fma %275, %87, %272 : vector<4xf32>
%277 = vector.extract_strided_slice %60 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%278 = vector.extract %277[0] : vector<1xf32>
%279 = splat %278 : vector<4xf32>
%280 = vector.fma %279, %88, %276 : vector<4xf32>
%281 = vector.extract_strided_slice %61 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%282 = vector.extract %281[0] : vector<1xf32>
%283 = splat %282 : vector<4xf32>
%284 = vector.fma %283, %81, %48#6 : vector<4xf32>
%285 = vector.extract_strided_slice %61 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%286 = vector.extract %285[0] : vector<1xf32>
%287 = splat %286 : vector<4xf32>
%288 = vector.fma %287, %82, %284 : vector<4xf32>
%289 = vector.extract_strided_slice %61 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%290 = vector.extract %289[0] : vector<1xf32>
%291 = splat %290 : vector<4xf32>
%292 = vector.fma %291, %83, %288 : vector<4xf32>
%293 = vector.extract_strided_slice %61 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%294 = vector.extract %293[0] : vector<1xf32>
%295 = splat %294 : vector<4xf32>
%296 = vector.fma %295, %84, %292 : vector<4xf32>
%297 = vector.extract_strided_slice %62 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%298 = vector.extract %297[0] : vector<1xf32>
%299 = splat %298 : vector<4xf32>
%300 = vector.fma %299, %85, %296 : vector<4xf32>
%301 = vector.extract_strided_slice %62 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%302 = vector.extract %301[0] : vector<1xf32>
%303 = splat %302 : vector<4xf32>
%304 = vector.fma %303, %86, %300 : vector<4xf32>
%305 = vector.extract_strided_slice %62 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%306 = vector.extract %305[0] : vector<1xf32>
%307 = splat %306 : vector<4xf32>
%308 = vector.fma %307, %87, %304 : vector<4xf32>
%309 = vector.extract_strided_slice %62 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%310 = vector.extract %309[0] : vector<1xf32>
%311 = splat %310 : vector<4xf32>
%312 = vector.fma %311, %88, %308 : vector<4xf32>
%313 = vector.extract_strided_slice %63 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%314 = vector.extract %313[0] : vector<1xf32>
%315 = splat %314 : vector<4xf32>
%316 = vector.fma %315, %81, %48#7 : vector<4xf32>
%317 = vector.extract_strided_slice %63 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%318 = vector.extract %317[0] : vector<1xf32>
%319 = splat %318 : vector<4xf32>
%320 = vector.fma %319, %82, %316 : vector<4xf32>
%321 = vector.extract_strided_slice %63 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%322 = vector.extract %321[0] : vector<1xf32>
%323 = splat %322 : vector<4xf32>
%324 = vector.fma %323, %83, %320 : vector<4xf32>
%325 = vector.extract_strided_slice %63 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%326 = vector.extract %325[0] : vector<1xf32>
%327 = splat %326 : vector<4xf32>
%328 = vector.fma %327, %84, %324 : vector<4xf32>
%329 = vector.extract_strided_slice %64 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%330 = vector.extract %329[0] : vector<1xf32>
%331 = splat %330 : vector<4xf32>
%332 = vector.fma %331, %85, %328 : vector<4xf32>
%333 = vector.extract_strided_slice %64 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%334 = vector.extract %333[0] : vector<1xf32>
%335 = splat %334 : vector<4xf32>
%336 = vector.fma %335, %86, %332 : vector<4xf32>
%337 = vector.extract_strided_slice %64 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%338 = vector.extract %337[0] : vector<1xf32>
%339 = splat %338 : vector<4xf32>
%340 = vector.fma %339, %87, %336 : vector<4xf32>
%341 = vector.extract_strided_slice %64 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%342 = vector.extract %341[0] : vector<1xf32>
%343 = splat %342 : vector<4xf32>
%344 = vector.fma %343, %88, %340 : vector<4xf32>
%345 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%346 = vector.extract %345[0] : vector<1xf32>
%347 = splat %346 : vector<4xf32>
%348 = vector.fma %347, %81, %48#8 : vector<4xf32>
%349 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%350 = vector.extract %349[0] : vector<1xf32>
%351 = splat %350 : vector<4xf32>
%352 = vector.fma %351, %82, %348 : vector<4xf32>
%353 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%354 = vector.extract %353[0] : vector<1xf32>
%355 = splat %354 : vector<4xf32>
%356 = vector.fma %355, %83, %352 : vector<4xf32>
%357 = vector.extract_strided_slice %65 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%358 = vector.extract %357[0] : vector<1xf32>
%359 = splat %358 : vector<4xf32>
%360 = vector.fma %359, %84, %356 : vector<4xf32>
%361 = vector.extract_strided_slice %66 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%362 = vector.extract %361[0] : vector<1xf32>
%363 = splat %362 : vector<4xf32>
%364 = vector.fma %363, %85, %360 : vector<4xf32>
%365 = vector.extract_strided_slice %66 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%366 = vector.extract %365[0] : vector<1xf32>
%367 = splat %366 : vector<4xf32>
%368 = vector.fma %367, %86, %364 : vector<4xf32>
%369 = vector.extract_strided_slice %66 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%370 = vector.extract %369[0] : vector<1xf32>
%371 = splat %370 : vector<4xf32>
%372 = vector.fma %371, %87, %368 : vector<4xf32>
%373 = vector.extract_strided_slice %66 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%374 = vector.extract %373[0] : vector<1xf32>
%375 = splat %374 : vector<4xf32>
%376 = vector.fma %375, %88, %372 : vector<4xf32>
%377 = vector.extract_strided_slice %67 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%378 = vector.extract %377[0] : vector<1xf32>
%379 = splat %378 : vector<4xf32>
%380 = vector.fma %379, %81, %48#9 : vector<4xf32>
%381 = vector.extract_strided_slice %67 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%382 = vector.extract %381[0] : vector<1xf32>
%383 = splat %382 : vector<4xf32>
%384 = vector.fma %383, %82, %380 : vector<4xf32>
%385 = vector.extract_strided_slice %67 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%386 = vector.extract %385[0] : vector<1xf32>
%387 = splat %386 : vector<4xf32>
%388 = vector.fma %387, %83, %384 : vector<4xf32>
%389 = vector.extract_strided_slice %67 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%390 = vector.extract %389[0] : vector<1xf32>
%391 = splat %390 : vector<4xf32>
%392 = vector.fma %391, %84, %388 : vector<4xf32>
%393 = vector.extract_strided_slice %68 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%394 = vector.extract %393[0] : vector<1xf32>
%395 = splat %394 : vector<4xf32>
%396 = vector.fma %395, %85, %392 : vector<4xf32>
%397 = vector.extract_strided_slice %68 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%398 = vector.extract %397[0] : vector<1xf32>
%399 = splat %398 : vector<4xf32>
%400 = vector.fma %399, %86, %396 : vector<4xf32>
%401 = vector.extract_strided_slice %68 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%402 = vector.extract %401[0] : vector<1xf32>
%403 = splat %402 : vector<4xf32>
%404 = vector.fma %403, %87, %400 : vector<4xf32>
%405 = vector.extract_strided_slice %68 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%406 = vector.extract %405[0] : vector<1xf32>
%407 = splat %406 : vector<4xf32>
%408 = vector.fma %407, %88, %404 : vector<4xf32>
%409 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%410 = vector.extract %409[0] : vector<1xf32>
%411 = splat %410 : vector<4xf32>
%412 = vector.fma %411, %81, %48#10 : vector<4xf32>
%413 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%414 = vector.extract %413[0] : vector<1xf32>
%415 = splat %414 : vector<4xf32>
%416 = vector.fma %415, %82, %412 : vector<4xf32>
%417 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%418 = vector.extract %417[0] : vector<1xf32>
%419 = splat %418 : vector<4xf32>
%420 = vector.fma %419, %83, %416 : vector<4xf32>
%421 = vector.extract_strided_slice %69 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%422 = vector.extract %421[0] : vector<1xf32>
%423 = splat %422 : vector<4xf32>
%424 = vector.fma %423, %84, %420 : vector<4xf32>
%425 = vector.extract_strided_slice %70 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%426 = vector.extract %425[0] : vector<1xf32>
%427 = splat %426 : vector<4xf32>
%428 = vector.fma %427, %85, %424 : vector<4xf32>
%429 = vector.extract_strided_slice %70 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%430 = vector.extract %429[0] : vector<1xf32>
%431 = splat %430 : vector<4xf32>
%432 = vector.fma %431, %86, %428 : vector<4xf32>
%433 = vector.extract_strided_slice %70 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%434 = vector.extract %433[0] : vector<1xf32>
%435 = splat %434 : vector<4xf32>
%436 = vector.fma %435, %87, %432 : vector<4xf32>
%437 = vector.extract_strided_slice %70 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%438 = vector.extract %437[0] : vector<1xf32>
%439 = splat %438 : vector<4xf32>
%440 = vector.fma %439, %88, %436 : vector<4xf32>
%441 = vector.extract_strided_slice %71 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%442 = vector.extract %441[0] : vector<1xf32>
%443 = splat %442 : vector<4xf32>
%444 = vector.fma %443, %81, %48#11 : vector<4xf32>
%445 = vector.extract_strided_slice %71 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%446 = vector.extract %445[0] : vector<1xf32>
%447 = splat %446 : vector<4xf32>
%448 = vector.fma %447, %82, %444 : vector<4xf32>
%449 = vector.extract_strided_slice %71 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%450 = vector.extract %449[0] : vector<1xf32>
%451 = splat %450 : vector<4xf32>
%452 = vector.fma %451, %83, %448 : vector<4xf32>
%453 = vector.extract_strided_slice %71 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%454 = vector.extract %453[0] : vector<1xf32>
%455 = splat %454 : vector<4xf32>
%456 = vector.fma %455, %84, %452 : vector<4xf32>
%457 = vector.extract_strided_slice %72 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%458 = vector.extract %457[0] : vector<1xf32>
%459 = splat %458 : vector<4xf32>
%460 = vector.fma %459, %85, %456 : vector<4xf32>
%461 = vector.extract_strided_slice %72 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%462 = vector.extract %461[0] : vector<1xf32>
%463 = splat %462 : vector<4xf32>
%464 = vector.fma %463, %86, %460 : vector<4xf32>
%465 = vector.extract_strided_slice %72 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%466 = vector.extract %465[0] : vector<1xf32>
%467 = splat %466 : vector<4xf32>
%468 = vector.fma %467, %87, %464 : vector<4xf32>
%469 = vector.extract_strided_slice %72 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%470 = vector.extract %469[0] : vector<1xf32>
%471 = splat %470 : vector<4xf32>
%472 = vector.fma %471, %88, %468 : vector<4xf32>
%473 = vector.extract_strided_slice %73 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%474 = vector.extract %473[0] : vector<1xf32>
%475 = splat %474 : vector<4xf32>
%476 = vector.fma %475, %81, %48#12 : vector<4xf32>
%477 = vector.extract_strided_slice %73 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%478 = vector.extract %477[0] : vector<1xf32>
%479 = splat %478 : vector<4xf32>
%480 = vector.fma %479, %82, %476 : vector<4xf32>
%481 = vector.extract_strided_slice %73 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%482 = vector.extract %481[0] : vector<1xf32>
%483 = splat %482 : vector<4xf32>
%484 = vector.fma %483, %83, %480 : vector<4xf32>
%485 = vector.extract_strided_slice %73 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%486 = vector.extract %485[0] : vector<1xf32>
%487 = splat %486 : vector<4xf32>
%488 = vector.fma %487, %84, %484 : vector<4xf32>
%489 = vector.extract_strided_slice %74 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%490 = vector.extract %489[0] : vector<1xf32>
%491 = splat %490 : vector<4xf32>
%492 = vector.fma %491, %85, %488 : vector<4xf32>
%493 = vector.extract_strided_slice %74 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%494 = vector.extract %493[0] : vector<1xf32>
%495 = splat %494 : vector<4xf32>
%496 = vector.fma %495, %86, %492 : vector<4xf32>
%497 = vector.extract_strided_slice %74 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%498 = vector.extract %497[0] : vector<1xf32>
%499 = splat %498 : vector<4xf32>
%500 = vector.fma %499, %87, %496 : vector<4xf32>
%501 = vector.extract_strided_slice %74 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%502 = vector.extract %501[0] : vector<1xf32>
%503 = splat %502 : vector<4xf32>
%504 = vector.fma %503, %88, %500 : vector<4xf32>
%505 = vector.extract_strided_slice %75 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%506 = vector.extract %505[0] : vector<1xf32>
%507 = splat %506 : vector<4xf32>
%508 = vector.fma %507, %81, %48#13 : vector<4xf32>
%509 = vector.extract_strided_slice %75 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%510 = vector.extract %509[0] : vector<1xf32>
%511 = splat %510 : vector<4xf32>
%512 = vector.fma %511, %82, %508 : vector<4xf32>
%513 = vector.extract_strided_slice %75 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%514 = vector.extract %513[0] : vector<1xf32>
%515 = splat %514 : vector<4xf32>
%516 = vector.fma %515, %83, %512 : vector<4xf32>
%517 = vector.extract_strided_slice %75 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%518 = vector.extract %517[0] : vector<1xf32>
%519 = splat %518 : vector<4xf32>
%520 = vector.fma %519, %84, %516 : vector<4xf32>
%521 = vector.extract_strided_slice %76 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%522 = vector.extract %521[0] : vector<1xf32>
%523 = splat %522 : vector<4xf32>
%524 = vector.fma %523, %85, %520 : vector<4xf32>
%525 = vector.extract_strided_slice %76 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%526 = vector.extract %525[0] : vector<1xf32>
%527 = splat %526 : vector<4xf32>
%528 = vector.fma %527, %86, %524 : vector<4xf32>
%529 = vector.extract_strided_slice %76 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%530 = vector.extract %529[0] : vector<1xf32>
%531 = splat %530 : vector<4xf32>
%532 = vector.fma %531, %87, %528 : vector<4xf32>
%533 = vector.extract_strided_slice %76 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%534 = vector.extract %533[0] : vector<1xf32>
%535 = splat %534 : vector<4xf32>
%536 = vector.fma %535, %88, %532 : vector<4xf32>
%537 = vector.extract_strided_slice %77 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%538 = vector.extract %537[0] : vector<1xf32>
%539 = splat %538 : vector<4xf32>
%540 = vector.fma %539, %81, %48#14 : vector<4xf32>
%541 = vector.extract_strided_slice %77 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%542 = vector.extract %541[0] : vector<1xf32>
%543 = splat %542 : vector<4xf32>
%544 = vector.fma %543, %82, %540 : vector<4xf32>
%545 = vector.extract_strided_slice %77 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%546 = vector.extract %545[0] : vector<1xf32>
%547 = splat %546 : vector<4xf32>
%548 = vector.fma %547, %83, %544 : vector<4xf32>
%549 = vector.extract_strided_slice %77 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%550 = vector.extract %549[0] : vector<1xf32>
%551 = splat %550 : vector<4xf32>
%552 = vector.fma %551, %84, %548 : vector<4xf32>
%553 = vector.extract_strided_slice %78 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%554 = vector.extract %553[0] : vector<1xf32>
%555 = splat %554 : vector<4xf32>
%556 = vector.fma %555, %85, %552 : vector<4xf32>
%557 = vector.extract_strided_slice %78 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%558 = vector.extract %557[0] : vector<1xf32>
%559 = splat %558 : vector<4xf32>
%560 = vector.fma %559, %86, %556 : vector<4xf32>
%561 = vector.extract_strided_slice %78 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%562 = vector.extract %561[0] : vector<1xf32>
%563 = splat %562 : vector<4xf32>
%564 = vector.fma %563, %87, %560 : vector<4xf32>
%565 = vector.extract_strided_slice %78 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%566 = vector.extract %565[0] : vector<1xf32>
%567 = splat %566 : vector<4xf32>
%568 = vector.fma %567, %88, %564 : vector<4xf32>
%569 = vector.extract_strided_slice %79 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%570 = vector.extract %569[0] : vector<1xf32>
%571 = splat %570 : vector<4xf32>
%572 = vector.fma %571, %81, %48#15 : vector<4xf32>
%573 = vector.extract_strided_slice %79 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%574 = vector.extract %573[0] : vector<1xf32>
%575 = splat %574 : vector<4xf32>
%576 = vector.fma %575, %82, %572 : vector<4xf32>
%577 = vector.extract_strided_slice %79 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%578 = vector.extract %577[0] : vector<1xf32>
%579 = splat %578 : vector<4xf32>
%580 = vector.fma %579, %83, %576 : vector<4xf32>
%581 = vector.extract_strided_slice %79 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%582 = vector.extract %581[0] : vector<1xf32>
%583 = splat %582 : vector<4xf32>
%584 = vector.fma %583, %84, %580 : vector<4xf32>
%585 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%586 = vector.extract %585[0] : vector<1xf32>
%587 = splat %586 : vector<4xf32>
%588 = vector.fma %587, %85, %584 : vector<4xf32>
%589 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%590 = vector.extract %589[0] : vector<1xf32>
%591 = splat %590 : vector<4xf32>
%592 = vector.fma %591, %86, %588 : vector<4xf32>
%593 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%594 = vector.extract %593[0] : vector<1xf32>
%595 = splat %594 : vector<4xf32>
%596 = vector.fma %595, %87, %592 : vector<4xf32>
%597 = vector.extract_strided_slice %80 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%598 = vector.extract %597[0] : vector<1xf32>
%599 = splat %598 : vector<4xf32>
%600 = vector.fma %599, %88, %596 : vector<4xf32>
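// The sixteen chained fma results (%120, %152, ..., %600) are this thread's
// 16x4 output tile; the transfer_writes below store it row by row into the
// C subview %42.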
vector.transfer_write %600, %42[%c15, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %568, %42[%c14, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %536, %42[%c13, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %504, %42[%c12, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %472, %42[%c11, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %440, %42[%c10, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %408, %42[%c9, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %376, %42[%c8, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %344, %42[%c7, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %312, %42[%c6, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %280, %42[%c5, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %248, %42[%c4, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %216, %42[%c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %184, %42[%c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %152, %42[%c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %120, %42[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After ConvertAffineToStandard //----- //
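// Note: ConvertAffineToStandard replaces the remaining affine.apply maps with
// plain std arithmetic, so the index computations in this dump appear as
// muli/addi plus cmpi/select/divi_signed groups (the expansion of affine
// floordiv).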
module {
memref.global "private" @__shared_memory___0 : memref<8x128xf32, 3>
memref.global "private" @__shared_memory__ : memref<64x8xf32, 3>
func @_large_aligned_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c15 = constant 15 : index
%c14 = constant 14 : index
%c13 = constant 13 : index
%c12 = constant 12 : index
%c11 = constant 11 : index
%c10 = constant 10 : index
%c9 = constant 9 : index
%c7 = constant 7 : index
%c6 = constant 6 : index
%c5 = constant 5 : index
%c4 = constant 4 : index
%c3 = constant 3 : index
%c2 = constant 2 : index
%c1 = constant 1 : index
%c0 = constant 0 : index
%c512 = constant 512 : index
%c2048 = constant 2048 : index
%cst_0 = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1016 = constant 1016 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
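// Workgroup shared memory (address space 3): %4 stages a 64x8 tile of the
// LHS and %3 an 8x128 tile of the RHS for each K-step of the main loop.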
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
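// Operand bindings: %5 is the 2048x1024 LHS, %6 the 1024x512 RHS, and %7 the
// 2048x512 result, via the s0b0_ro/s0b1_ro/s0b2_xw bindings of @io.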
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%c64 = constant 64 : index
%8 = muli %workgroup_id_y, %c64 : index
%c64_1 = constant 64 : index
%9 = muli %workgroup_count_y, %c64_1 : index
%c128 = constant 128 : index
%10 = muli %workgroup_id_x, %c128 : index
%c128_2 = constant 128 : index
%11 = muli %workgroup_count_x, %c128_2 : index
%c16 = constant 16 : index
%12 = muli %1, %c16 : index
%c4_3 = constant 4 : index
%13 = muli %0, %c4_3 : index
%14 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%19 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%20 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%21 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%22 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%23 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%24 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%25 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%26 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%27 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%28 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%29 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%30 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%31 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%c16_4 = constant 16 : index
%32 = muli %1, %c16_4 : index
%c64_5 = constant 64 : index
%33 = muli %2, %c64_5 : index
%34 = addi %32, %33 : index
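// The cmpi/subi/select/divi_signed sequences below are the standard-dialect
// expansion of affine floordiv (here by 2 and by 32); the selects correct
// signed division so it rounds toward negative infinity.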
%c2_6 = constant 2 : index
%c0_7 = constant 0 : index
%c-1 = constant -1 : index
%35 = cmpi slt, %0, %c0_7 : index
%36 = subi %c-1, %0 : index
%37 = select %35, %36, %0 : index
%38 = divi_signed %37, %c2_6 : index
%39 = subi %c-1, %38 : index
%40 = select %35, %39, %38 : index
%41 = addi %34, %40 : index
%c4_8 = constant 4 : index
%42 = muli %0, %c4_8 : index
%c2_9 = constant 2 : index
%c0_10 = constant 0 : index
%c-1_11 = constant -1 : index
%43 = cmpi slt, %0, %c0_10 : index
%44 = subi %c-1_11, %0 : index
%45 = select %43, %44, %0 : index
%46 = divi_signed %45, %c2_9 : index
%47 = subi %c-1_11, %46 : index
%48 = select %43, %47, %46 : index
%c-8 = constant -8 : index
%49 = muli %48, %c-8 : index
%50 = addi %42, %49 : index
%c4_12 = constant 4 : index
%51 = muli %2, %c4_12 : index
%52 = addi %1, %51 : index
%c32 = constant 32 : index
%c0_13 = constant 0 : index
%c-1_14 = constant -1 : index
%53 = cmpi slt, %0, %c0_13 : index
%54 = subi %c-1_14, %0 : index
%55 = select %53, %54, %0 : index
%56 = divi_signed %55, %c32 : index
%57 = subi %c-1_14, %56 : index
%58 = select %53, %57, %56 : index
%59 = addi %52, %58 : index
%c4_15 = constant 4 : index
%60 = muli %0, %c4_15 : index
%c32_16 = constant 32 : index
%c0_17 = constant 0 : index
%c-1_18 = constant -1 : index
%61 = cmpi slt, %0, %c0_17 : index
%62 = subi %c-1_18, %0 : index
%63 = select %61, %62, %0 : index
%64 = divi_signed %63, %c32_16 : index
%65 = subi %c-1_18, %64 : index
%66 = select %61, %65, %64 : index
%c-128 = constant -128 : index
%67 = muli %66, %c-128 : index
%68 = addi %60, %67 : index
%c4_19 = constant 4 : index
%69 = muli %2, %c4_19 : index
%70 = addi %1, %69 : index
%c32_20 = constant 32 : index
%c0_21 = constant 0 : index
%c-1_22 = constant -1 : index
%71 = cmpi slt, %0, %c0_21 : index
%72 = subi %c-1_22, %0 : index
%73 = select %71, %72, %0 : index
%74 = divi_signed %73, %c32_20 : index
%75 = subi %c-1_22, %74 : index
%76 = select %71, %75, %74 : index
%77 = addi %70, %76 : index
%c4_23 = constant 4 : index
%78 = addi %77, %c4_23 : index
%79 = memref.subview %30[%12, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%80 = memref.subview %31[0, %13] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
scf.for %arg0 = %8 to %c2048 step %9 {
%81 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
scf.for %arg1 = %10 to %c512 step %11 {
%82 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%83 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%84 = memref.subview %83[%12, %13] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%85 = memref.subview %81[0, %c0] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%86 = memref.subview %82[%c0, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%87 = vector.transfer_read %85[%41, %50], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%88 = vector.transfer_read %86[%59, %68], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%89 = vector.transfer_read %86[%78, %68], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
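// Software-pipelined K loop (0..1016 step 8): the sixteen vector<4xf32>
// accumulators plus the three prefetched global loads (%87, %88, %89, tagged
// __pipelining_global_load__) are carried as iter_args, so each iteration
// first commits the previous prefetch to shared memory and then computes
// from it; the final K-step presumably drains after the loop, as usual for
// this pipelining.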
%90:19 = scf.for %arg2 = %c0 to %c1016 step %c8 iter_args(%arg3 = %14, %arg4 = %15, %arg5 = %16, %arg6 = %17, %arg7 = %18, %arg8 = %19, %arg9 = %20, %arg10 = %21, %arg11 = %22, %arg12 = %23, %arg13 = %24, %arg14 = %25, %arg15 = %26, %arg16 = %27, %arg17 = %28, %arg18 = %29, %arg19 = %87, %arg20 = %88, %arg21 = %89) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
gpu.barrier
vector.transfer_write %arg19, %30[%41, %50] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %arg20, %31[%59, %68] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %arg21, %31[%78, %68] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
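// Compute phase: read the thread's 16x8 LHS tile (%79) and 8x4 RHS tile
// (%80) out of shared memory, then run the fully unrolled 16x4x8
// multiply-accumulate below.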
%643 = vector.transfer_read %79[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%644 = vector.transfer_read %79[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%645 = vector.transfer_read %79[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%646 = vector.transfer_read %79[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%647 = vector.transfer_read %79[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%648 = vector.transfer_read %79[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%649 = vector.transfer_read %79[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%650 = vector.transfer_read %79[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%651 = vector.transfer_read %79[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%652 = vector.transfer_read %79[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%653 = vector.transfer_read %79[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%654 = vector.transfer_read %79[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%655 = vector.transfer_read %79[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%656 = vector.transfer_read %79[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%657 = vector.transfer_read %79[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%658 = vector.transfer_read %79[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%659 = vector.transfer_read %79[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%660 = vector.transfer_read %79[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%661 = vector.transfer_read %79[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%662 = vector.transfer_read %79[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%663 = vector.transfer_read %79[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%664 = vector.transfer_read %79[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%665 = vector.transfer_read %79[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%666 = vector.transfer_read %79[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%667 = vector.transfer_read %79[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%668 = vector.transfer_read %79[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%669 = vector.transfer_read %79[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%670 = vector.transfer_read %79[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%671 = vector.transfer_read %79[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%672 = vector.transfer_read %79[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%673 = vector.transfer_read %79[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%674 = vector.transfer_read %79[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%675 = vector.transfer_read %80[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%676 = vector.transfer_read %80[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%677 = vector.transfer_read %80[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%678 = vector.transfer_read %80[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%679 = vector.transfer_read %80[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%680 = vector.transfer_read %80[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%681 = vector.transfer_read %80[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%682 = vector.transfer_read %80[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
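// Each LHS scalar is broadcast (extract_strided_slice of one lane + splat)
// and fused-multiplied against an RHS row of four floats, chaining through
// the row accumulator: acc[m] += lhs[m, k] * rhs[k, 0:4] for k = 0..7.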
%683 = vector.extract_strided_slice %643 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%684 = vector.extract %683[0] : vector<1xf32>
%685 = splat %684 : vector<4xf32>
%686 = vector.fma %685, %675, %arg3 : vector<4xf32>
%687 = vector.extract_strided_slice %643 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%688 = vector.extract %687[0] : vector<1xf32>
%689 = splat %688 : vector<4xf32>
%690 = vector.fma %689, %676, %686 : vector<4xf32>
%691 = vector.extract_strided_slice %643 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%692 = vector.extract %691[0] : vector<1xf32>
%693 = splat %692 : vector<4xf32>
%694 = vector.fma %693, %677, %690 : vector<4xf32>
%695 = vector.extract_strided_slice %643 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%696 = vector.extract %695[0] : vector<1xf32>
%697 = splat %696 : vector<4xf32>
%698 = vector.fma %697, %678, %694 : vector<4xf32>
%699 = vector.extract_strided_slice %644 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%700 = vector.extract %699[0] : vector<1xf32>
%701 = splat %700 : vector<4xf32>
%702 = vector.fma %701, %679, %698 : vector<4xf32>
%703 = vector.extract_strided_slice %644 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%704 = vector.extract %703[0] : vector<1xf32>
%705 = splat %704 : vector<4xf32>
%706 = vector.fma %705, %680, %702 : vector<4xf32>
%707 = vector.extract_strided_slice %644 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%708 = vector.extract %707[0] : vector<1xf32>
%709 = splat %708 : vector<4xf32>
%710 = vector.fma %709, %681, %706 : vector<4xf32>
%711 = vector.extract_strided_slice %644 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%712 = vector.extract %711[0] : vector<1xf32>
%713 = splat %712 : vector<4xf32>
%714 = vector.fma %713, %682, %710 : vector<4xf32>
%715 = vector.extract_strided_slice %645 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%716 = vector.extract %715[0] : vector<1xf32>
%717 = splat %716 : vector<4xf32>
%718 = vector.fma %717, %675, %arg4 : vector<4xf32>
%719 = vector.extract_strided_slice %645 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%720 = vector.extract %719[0] : vector<1xf32>
%721 = splat %720 : vector<4xf32>
%722 = vector.fma %721, %676, %718 : vector<4xf32>
%723 = vector.extract_strided_slice %645 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%724 = vector.extract %723[0] : vector<1xf32>
%725 = splat %724 : vector<4xf32>
%726 = vector.fma %725, %677, %722 : vector<4xf32>
%727 = vector.extract_strided_slice %645 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%728 = vector.extract %727[0] : vector<1xf32>
%729 = splat %728 : vector<4xf32>
%730 = vector.fma %729, %678, %726 : vector<4xf32>
%731 = vector.extract_strided_slice %646 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%732 = vector.extract %731[0] : vector<1xf32>
%733 = splat %732 : vector<4xf32>
%734 = vector.fma %733, %679, %730 : vector<4xf32>
%735 = vector.extract_strided_slice %646 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%736 = vector.extract %735[0] : vector<1xf32>
%737 = splat %736 : vector<4xf32>
%738 = vector.fma %737, %680, %734 : vector<4xf32>
%739 = vector.extract_strided_slice %646 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%740 = vector.extract %739[0] : vector<1xf32>
%741 = splat %740 : vector<4xf32>
%742 = vector.fma %741, %681, %738 : vector<4xf32>
%743 = vector.extract_strided_slice %646 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%744 = vector.extract %743[0] : vector<1xf32>
%745 = splat %744 : vector<4xf32>
%746 = vector.fma %745, %682, %742 : vector<4xf32>
%747 = vector.extract_strided_slice %647 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%748 = vector.extract %747[0] : vector<1xf32>
%749 = splat %748 : vector<4xf32>
%750 = vector.fma %749, %675, %arg5 : vector<4xf32>
%751 = vector.extract_strided_slice %647 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%752 = vector.extract %751[0] : vector<1xf32>
%753 = splat %752 : vector<4xf32>
%754 = vector.fma %753, %676, %750 : vector<4xf32>
%755 = vector.extract_strided_slice %647 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%756 = vector.extract %755[0] : vector<1xf32>
%757 = splat %756 : vector<4xf32>
%758 = vector.fma %757, %677, %754 : vector<4xf32>
%759 = vector.extract_strided_slice %647 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%760 = vector.extract %759[0] : vector<1xf32>
%761 = splat %760 : vector<4xf32>
%762 = vector.fma %761, %678, %758 : vector<4xf32>
%763 = vector.extract_strided_slice %648 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%764 = vector.extract %763[0] : vector<1xf32>
%765 = splat %764 : vector<4xf32>
%766 = vector.fma %765, %679, %762 : vector<4xf32>
%767 = vector.extract_strided_slice %648 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%768 = vector.extract %767[0] : vector<1xf32>
%769 = splat %768 : vector<4xf32>
%770 = vector.fma %769, %680, %766 : vector<4xf32>
%771 = vector.extract_strided_slice %648 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%772 = vector.extract %771[0] : vector<1xf32>
%773 = splat %772 : vector<4xf32>
%774 = vector.fma %773, %681, %770 : vector<4xf32>
%775 = vector.extract_strided_slice %648 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%776 = vector.extract %775[0] : vector<1xf32>
%777 = splat %776 : vector<4xf32>
%778 = vector.fma %777, %682, %774 : vector<4xf32>
%779 = vector.extract_strided_slice %649 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%780 = vector.extract %779[0] : vector<1xf32>
%781 = splat %780 : vector<4xf32>
%782 = vector.fma %781, %675, %arg6 : vector<4xf32>
%783 = vector.extract_strided_slice %649 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%784 = vector.extract %783[0] : vector<1xf32>
%785 = splat %784 : vector<4xf32>
%786 = vector.fma %785, %676, %782 : vector<4xf32>
%787 = vector.extract_strided_slice %649 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%788 = vector.extract %787[0] : vector<1xf32>
%789 = splat %788 : vector<4xf32>
%790 = vector.fma %789, %677, %786 : vector<4xf32>
%791 = vector.extract_strided_slice %649 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%792 = vector.extract %791[0] : vector<1xf32>
%793 = splat %792 : vector<4xf32>
%794 = vector.fma %793, %678, %790 : vector<4xf32>
%795 = vector.extract_strided_slice %650 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%796 = vector.extract %795[0] : vector<1xf32>
%797 = splat %796 : vector<4xf32>
%798 = vector.fma %797, %679, %794 : vector<4xf32>
%799 = vector.extract_strided_slice %650 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%800 = vector.extract %799[0] : vector<1xf32>
%801 = splat %800 : vector<4xf32>
%802 = vector.fma %801, %680, %798 : vector<4xf32>
%803 = vector.extract_strided_slice %650 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%804 = vector.extract %803[0] : vector<1xf32>
%805 = splat %804 : vector<4xf32>
%806 = vector.fma %805, %681, %802 : vector<4xf32>
%807 = vector.extract_strided_slice %650 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%808 = vector.extract %807[0] : vector<1xf32>
%809 = splat %808 : vector<4xf32>
%810 = vector.fma %809, %682, %806 : vector<4xf32>
%811 = vector.extract_strided_slice %651 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%812 = vector.extract %811[0] : vector<1xf32>
%813 = splat %812 : vector<4xf32>
%814 = vector.fma %813, %675, %arg7 : vector<4xf32>
%815 = vector.extract_strided_slice %651 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%816 = vector.extract %815[0] : vector<1xf32>
%817 = splat %816 : vector<4xf32>
%818 = vector.fma %817, %676, %814 : vector<4xf32>
%819 = vector.extract_strided_slice %651 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%820 = vector.extract %819[0] : vector<1xf32>
%821 = splat %820 : vector<4xf32>
%822 = vector.fma %821, %677, %818 : vector<4xf32>
%823 = vector.extract_strided_slice %651 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%824 = vector.extract %823[0] : vector<1xf32>
%825 = splat %824 : vector<4xf32>
%826 = vector.fma %825, %678, %822 : vector<4xf32>
%827 = vector.extract_strided_slice %652 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%828 = vector.extract %827[0] : vector<1xf32>
%829 = splat %828 : vector<4xf32>
%830 = vector.fma %829, %679, %826 : vector<4xf32>
%831 = vector.extract_strided_slice %652 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%832 = vector.extract %831[0] : vector<1xf32>
%833 = splat %832 : vector<4xf32>
%834 = vector.fma %833, %680, %830 : vector<4xf32>
%835 = vector.extract_strided_slice %652 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%836 = vector.extract %835[0] : vector<1xf32>
%837 = splat %836 : vector<4xf32>
%838 = vector.fma %837, %681, %834 : vector<4xf32>
%839 = vector.extract_strided_slice %652 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%840 = vector.extract %839[0] : vector<1xf32>
%841 = splat %840 : vector<4xf32>
%842 = vector.fma %841, %682, %838 : vector<4xf32>
%843 = vector.extract_strided_slice %653 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%844 = vector.extract %843[0] : vector<1xf32>
%845 = splat %844 : vector<4xf32>
%846 = vector.fma %845, %675, %arg8 : vector<4xf32>
%847 = vector.extract_strided_slice %653 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%848 = vector.extract %847[0] : vector<1xf32>
%849 = splat %848 : vector<4xf32>
%850 = vector.fma %849, %676, %846 : vector<4xf32>
%851 = vector.extract_strided_slice %653 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%852 = vector.extract %851[0] : vector<1xf32>
%853 = splat %852 : vector<4xf32>
%854 = vector.fma %853, %677, %850 : vector<4xf32>
%855 = vector.extract_strided_slice %653 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%856 = vector.extract %855[0] : vector<1xf32>
%857 = splat %856 : vector<4xf32>
%858 = vector.fma %857, %678, %854 : vector<4xf32>
%859 = vector.extract_strided_slice %654 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%860 = vector.extract %859[0] : vector<1xf32>
%861 = splat %860 : vector<4xf32>
%862 = vector.fma %861, %679, %858 : vector<4xf32>
%863 = vector.extract_strided_slice %654 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%864 = vector.extract %863[0] : vector<1xf32>
%865 = splat %864 : vector<4xf32>
%866 = vector.fma %865, %680, %862 : vector<4xf32>
%867 = vector.extract_strided_slice %654 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%868 = vector.extract %867[0] : vector<1xf32>
%869 = splat %868 : vector<4xf32>
%870 = vector.fma %869, %681, %866 : vector<4xf32>
%871 = vector.extract_strided_slice %654 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%872 = vector.extract %871[0] : vector<1xf32>
%873 = splat %872 : vector<4xf32>
%874 = vector.fma %873, %682, %870 : vector<4xf32>
%875 = vector.extract_strided_slice %655 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%876 = vector.extract %875[0] : vector<1xf32>
%877 = splat %876 : vector<4xf32>
%878 = vector.fma %877, %675, %arg9 : vector<4xf32>
%879 = vector.extract_strided_slice %655 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%880 = vector.extract %879[0] : vector<1xf32>
%881 = splat %880 : vector<4xf32>
%882 = vector.fma %881, %676, %878 : vector<4xf32>
%883 = vector.extract_strided_slice %655 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%884 = vector.extract %883[0] : vector<1xf32>
%885 = splat %884 : vector<4xf32>
%886 = vector.fma %885, %677, %882 : vector<4xf32>
%887 = vector.extract_strided_slice %655 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%888 = vector.extract %887[0] : vector<1xf32>
%889 = splat %888 : vector<4xf32>
%890 = vector.fma %889, %678, %886 : vector<4xf32>
%891 = vector.extract_strided_slice %656 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%892 = vector.extract %891[0] : vector<1xf32>
%893 = splat %892 : vector<4xf32>
%894 = vector.fma %893, %679, %890 : vector<4xf32>
%895 = vector.extract_strided_slice %656 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%896 = vector.extract %895[0] : vector<1xf32>
%897 = splat %896 : vector<4xf32>
%898 = vector.fma %897, %680, %894 : vector<4xf32>
%899 = vector.extract_strided_slice %656 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%900 = vector.extract %899[0] : vector<1xf32>
%901 = splat %900 : vector<4xf32>
%902 = vector.fma %901, %681, %898 : vector<4xf32>
%903 = vector.extract_strided_slice %656 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%904 = vector.extract %903[0] : vector<1xf32>
%905 = splat %904 : vector<4xf32>
%906 = vector.fma %905, %682, %902 : vector<4xf32>
%907 = vector.extract_strided_slice %657 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%908 = vector.extract %907[0] : vector<1xf32>
%909 = splat %908 : vector<4xf32>
%910 = vector.fma %909, %675, %arg10 : vector<4xf32>
%911 = vector.extract_strided_slice %657 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%912 = vector.extract %911[0] : vector<1xf32>
%913 = splat %912 : vector<4xf32>
%914 = vector.fma %913, %676, %910 : vector<4xf32>
%915 = vector.extract_strided_slice %657 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%916 = vector.extract %915[0] : vector<1xf32>
%917 = splat %916 : vector<4xf32>
%918 = vector.fma %917, %677, %914 : vector<4xf32>
%919 = vector.extract_strided_slice %657 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%920 = vector.extract %919[0] : vector<1xf32>
%921 = splat %920 : vector<4xf32>
%922 = vector.fma %921, %678, %918 : vector<4xf32>
%923 = vector.extract_strided_slice %658 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%924 = vector.extract %923[0] : vector<1xf32>
%925 = splat %924 : vector<4xf32>
%926 = vector.fma %925, %679, %922 : vector<4xf32>
%927 = vector.extract_strided_slice %658 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%928 = vector.extract %927[0] : vector<1xf32>
%929 = splat %928 : vector<4xf32>
%930 = vector.fma %929, %680, %926 : vector<4xf32>
%931 = vector.extract_strided_slice %658 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%932 = vector.extract %931[0] : vector<1xf32>
%933 = splat %932 : vector<4xf32>
%934 = vector.fma %933, %681, %930 : vector<4xf32>
%935 = vector.extract_strided_slice %658 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%936 = vector.extract %935[0] : vector<1xf32>
%937 = splat %936 : vector<4xf32>
%938 = vector.fma %937, %682, %934 : vector<4xf32>
%939 = vector.extract_strided_slice %659 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%940 = vector.extract %939[0] : vector<1xf32>
%941 = splat %940 : vector<4xf32>
%942 = vector.fma %941, %675, %arg11 : vector<4xf32>
%943 = vector.extract_strided_slice %659 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%944 = vector.extract %943[0] : vector<1xf32>
%945 = splat %944 : vector<4xf32>
%946 = vector.fma %945, %676, %942 : vector<4xf32>
%947 = vector.extract_strided_slice %659 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%948 = vector.extract %947[0] : vector<1xf32>
%949 = splat %948 : vector<4xf32>
%950 = vector.fma %949, %677, %946 : vector<4xf32>
%951 = vector.extract_strided_slice %659 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%952 = vector.extract %951[0] : vector<1xf32>
%953 = splat %952 : vector<4xf32>
%954 = vector.fma %953, %678, %950 : vector<4xf32>
%955 = vector.extract_strided_slice %660 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%956 = vector.extract %955[0] : vector<1xf32>
%957 = splat %956 : vector<4xf32>
%958 = vector.fma %957, %679, %954 : vector<4xf32>
%959 = vector.extract_strided_slice %660 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%960 = vector.extract %959[0] : vector<1xf32>
%961 = splat %960 : vector<4xf32>
%962 = vector.fma %961, %680, %958 : vector<4xf32>
%963 = vector.extract_strided_slice %660 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%964 = vector.extract %963[0] : vector<1xf32>
%965 = splat %964 : vector<4xf32>
%966 = vector.fma %965, %681, %962 : vector<4xf32>
%967 = vector.extract_strided_slice %660 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%968 = vector.extract %967[0] : vector<1xf32>
%969 = splat %968 : vector<4xf32>
%970 = vector.fma %969, %682, %966 : vector<4xf32>
%971 = vector.extract_strided_slice %661 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%972 = vector.extract %971[0] : vector<1xf32>
%973 = splat %972 : vector<4xf32>
%974 = vector.fma %973, %675, %arg12 : vector<4xf32>
%975 = vector.extract_strided_slice %661 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%976 = vector.extract %975[0] : vector<1xf32>
%977 = splat %976 : vector<4xf32>
%978 = vector.fma %977, %676, %974 : vector<4xf32>
%979 = vector.extract_strided_slice %661 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%980 = vector.extract %979[0] : vector<1xf32>
%981 = splat %980 : vector<4xf32>
%982 = vector.fma %981, %677, %978 : vector<4xf32>
%983 = vector.extract_strided_slice %661 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%984 = vector.extract %983[0] : vector<1xf32>
%985 = splat %984 : vector<4xf32>
%986 = vector.fma %985, %678, %982 : vector<4xf32>
%987 = vector.extract_strided_slice %662 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%988 = vector.extract %987[0] : vector<1xf32>
%989 = splat %988 : vector<4xf32>
%990 = vector.fma %989, %679, %986 : vector<4xf32>
%991 = vector.extract_strided_slice %662 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%992 = vector.extract %991[0] : vector<1xf32>
%993 = splat %992 : vector<4xf32>
%994 = vector.fma %993, %680, %990 : vector<4xf32>
%995 = vector.extract_strided_slice %662 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%996 = vector.extract %995[0] : vector<1xf32>
%997 = splat %996 : vector<4xf32>
%998 = vector.fma %997, %681, %994 : vector<4xf32>
%999 = vector.extract_strided_slice %662 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1000 = vector.extract %999[0] : vector<1xf32>
%1001 = splat %1000 : vector<4xf32>
%1002 = vector.fma %1001, %682, %998 : vector<4xf32>
%1003 = vector.extract_strided_slice %663 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1004 = vector.extract %1003[0] : vector<1xf32>
%1005 = splat %1004 : vector<4xf32>
%1006 = vector.fma %1005, %675, %arg13 : vector<4xf32>
%1007 = vector.extract_strided_slice %663 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1008 = vector.extract %1007[0] : vector<1xf32>
%1009 = splat %1008 : vector<4xf32>
%1010 = vector.fma %1009, %676, %1006 : vector<4xf32>
%1011 = vector.extract_strided_slice %663 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1012 = vector.extract %1011[0] : vector<1xf32>
%1013 = splat %1012 : vector<4xf32>
%1014 = vector.fma %1013, %677, %1010 : vector<4xf32>
%1015 = vector.extract_strided_slice %663 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1016 = vector.extract %1015[0] : vector<1xf32>
%1017 = splat %1016 : vector<4xf32>
%1018 = vector.fma %1017, %678, %1014 : vector<4xf32>
%1019 = vector.extract_strided_slice %664 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1020 = vector.extract %1019[0] : vector<1xf32>
%1021 = splat %1020 : vector<4xf32>
%1022 = vector.fma %1021, %679, %1018 : vector<4xf32>
%1023 = vector.extract_strided_slice %664 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1024 = vector.extract %1023[0] : vector<1xf32>
%1025 = splat %1024 : vector<4xf32>
%1026 = vector.fma %1025, %680, %1022 : vector<4xf32>
%1027 = vector.extract_strided_slice %664 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1028 = vector.extract %1027[0] : vector<1xf32>
%1029 = splat %1028 : vector<4xf32>
%1030 = vector.fma %1029, %681, %1026 : vector<4xf32>
%1031 = vector.extract_strided_slice %664 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1032 = vector.extract %1031[0] : vector<1xf32>
%1033 = splat %1032 : vector<4xf32>
%1034 = vector.fma %1033, %682, %1030 : vector<4xf32>
%1035 = vector.extract_strided_slice %665 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1036 = vector.extract %1035[0] : vector<1xf32>
%1037 = splat %1036 : vector<4xf32>
%1038 = vector.fma %1037, %675, %arg14 : vector<4xf32>
%1039 = vector.extract_strided_slice %665 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1040 = vector.extract %1039[0] : vector<1xf32>
%1041 = splat %1040 : vector<4xf32>
%1042 = vector.fma %1041, %676, %1038 : vector<4xf32>
%1043 = vector.extract_strided_slice %665 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1044 = vector.extract %1043[0] : vector<1xf32>
%1045 = splat %1044 : vector<4xf32>
%1046 = vector.fma %1045, %677, %1042 : vector<4xf32>
%1047 = vector.extract_strided_slice %665 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1048 = vector.extract %1047[0] : vector<1xf32>
%1049 = splat %1048 : vector<4xf32>
%1050 = vector.fma %1049, %678, %1046 : vector<4xf32>
%1051 = vector.extract_strided_slice %666 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1052 = vector.extract %1051[0] : vector<1xf32>
%1053 = splat %1052 : vector<4xf32>
%1054 = vector.fma %1053, %679, %1050 : vector<4xf32>
%1055 = vector.extract_strided_slice %666 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1056 = vector.extract %1055[0] : vector<1xf32>
%1057 = splat %1056 : vector<4xf32>
%1058 = vector.fma %1057, %680, %1054 : vector<4xf32>
%1059 = vector.extract_strided_slice %666 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1060 = vector.extract %1059[0] : vector<1xf32>
%1061 = splat %1060 : vector<4xf32>
%1062 = vector.fma %1061, %681, %1058 : vector<4xf32>
%1063 = vector.extract_strided_slice %666 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1064 = vector.extract %1063[0] : vector<1xf32>
%1065 = splat %1064 : vector<4xf32>
%1066 = vector.fma %1065, %682, %1062 : vector<4xf32>
%1067 = vector.extract_strided_slice %667 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1068 = vector.extract %1067[0] : vector<1xf32>
%1069 = splat %1068 : vector<4xf32>
%1070 = vector.fma %1069, %675, %arg15 : vector<4xf32>
%1071 = vector.extract_strided_slice %667 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1072 = vector.extract %1071[0] : vector<1xf32>
%1073 = splat %1072 : vector<4xf32>
%1074 = vector.fma %1073, %676, %1070 : vector<4xf32>
%1075 = vector.extract_strided_slice %667 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1076 = vector.extract %1075[0] : vector<1xf32>
%1077 = splat %1076 : vector<4xf32>
%1078 = vector.fma %1077, %677, %1074 : vector<4xf32>
%1079 = vector.extract_strided_slice %667 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1080 = vector.extract %1079[0] : vector<1xf32>
%1081 = splat %1080 : vector<4xf32>
%1082 = vector.fma %1081, %678, %1078 : vector<4xf32>
%1083 = vector.extract_strided_slice %668 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1084 = vector.extract %1083[0] : vector<1xf32>
%1085 = splat %1084 : vector<4xf32>
%1086 = vector.fma %1085, %679, %1082 : vector<4xf32>
%1087 = vector.extract_strided_slice %668 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1088 = vector.extract %1087[0] : vector<1xf32>
%1089 = splat %1088 : vector<4xf32>
%1090 = vector.fma %1089, %680, %1086 : vector<4xf32>
%1091 = vector.extract_strided_slice %668 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1092 = vector.extract %1091[0] : vector<1xf32>
%1093 = splat %1092 : vector<4xf32>
%1094 = vector.fma %1093, %681, %1090 : vector<4xf32>
%1095 = vector.extract_strided_slice %668 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1096 = vector.extract %1095[0] : vector<1xf32>
%1097 = splat %1096 : vector<4xf32>
%1098 = vector.fma %1097, %682, %1094 : vector<4xf32>
%1099 = vector.extract_strided_slice %669 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1100 = vector.extract %1099[0] : vector<1xf32>
%1101 = splat %1100 : vector<4xf32>
%1102 = vector.fma %1101, %675, %arg16 : vector<4xf32>
%1103 = vector.extract_strided_slice %669 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1104 = vector.extract %1103[0] : vector<1xf32>
%1105 = splat %1104 : vector<4xf32>
%1106 = vector.fma %1105, %676, %1102 : vector<4xf32>
%1107 = vector.extract_strided_slice %669 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1108 = vector.extract %1107[0] : vector<1xf32>
%1109 = splat %1108 : vector<4xf32>
%1110 = vector.fma %1109, %677, %1106 : vector<4xf32>
%1111 = vector.extract_strided_slice %669 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1112 = vector.extract %1111[0] : vector<1xf32>
%1113 = splat %1112 : vector<4xf32>
%1114 = vector.fma %1113, %678, %1110 : vector<4xf32>
%1115 = vector.extract_strided_slice %670 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1116 = vector.extract %1115[0] : vector<1xf32>
%1117 = splat %1116 : vector<4xf32>
%1118 = vector.fma %1117, %679, %1114 : vector<4xf32>
%1119 = vector.extract_strided_slice %670 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1120 = vector.extract %1119[0] : vector<1xf32>
%1121 = splat %1120 : vector<4xf32>
%1122 = vector.fma %1121, %680, %1118 : vector<4xf32>
%1123 = vector.extract_strided_slice %670 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1124 = vector.extract %1123[0] : vector<1xf32>
%1125 = splat %1124 : vector<4xf32>
%1126 = vector.fma %1125, %681, %1122 : vector<4xf32>
%1127 = vector.extract_strided_slice %670 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1128 = vector.extract %1127[0] : vector<1xf32>
%1129 = splat %1128 : vector<4xf32>
%1130 = vector.fma %1129, %682, %1126 : vector<4xf32>
%1131 = vector.extract_strided_slice %671 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1132 = vector.extract %1131[0] : vector<1xf32>
%1133 = splat %1132 : vector<4xf32>
%1134 = vector.fma %1133, %675, %arg17 : vector<4xf32>
%1135 = vector.extract_strided_slice %671 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1136 = vector.extract %1135[0] : vector<1xf32>
%1137 = splat %1136 : vector<4xf32>
%1138 = vector.fma %1137, %676, %1134 : vector<4xf32>
%1139 = vector.extract_strided_slice %671 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1140 = vector.extract %1139[0] : vector<1xf32>
%1141 = splat %1140 : vector<4xf32>
%1142 = vector.fma %1141, %677, %1138 : vector<4xf32>
%1143 = vector.extract_strided_slice %671 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1144 = vector.extract %1143[0] : vector<1xf32>
%1145 = splat %1144 : vector<4xf32>
%1146 = vector.fma %1145, %678, %1142 : vector<4xf32>
%1147 = vector.extract_strided_slice %672 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1148 = vector.extract %1147[0] : vector<1xf32>
%1149 = splat %1148 : vector<4xf32>
%1150 = vector.fma %1149, %679, %1146 : vector<4xf32>
%1151 = vector.extract_strided_slice %672 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1152 = vector.extract %1151[0] : vector<1xf32>
%1153 = splat %1152 : vector<4xf32>
%1154 = vector.fma %1153, %680, %1150 : vector<4xf32>
%1155 = vector.extract_strided_slice %672 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1156 = vector.extract %1155[0] : vector<1xf32>
%1157 = splat %1156 : vector<4xf32>
%1158 = vector.fma %1157, %681, %1154 : vector<4xf32>
%1159 = vector.extract_strided_slice %672 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1160 = vector.extract %1159[0] : vector<1xf32>
%1161 = splat %1160 : vector<4xf32>
%1162 = vector.fma %1161, %682, %1158 : vector<4xf32>
%1163 = vector.extract_strided_slice %673 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1164 = vector.extract %1163[0] : vector<1xf32>
%1165 = splat %1164 : vector<4xf32>
%1166 = vector.fma %1165, %675, %arg18 : vector<4xf32>
%1167 = vector.extract_strided_slice %673 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1168 = vector.extract %1167[0] : vector<1xf32>
%1169 = splat %1168 : vector<4xf32>
%1170 = vector.fma %1169, %676, %1166 : vector<4xf32>
%1171 = vector.extract_strided_slice %673 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1172 = vector.extract %1171[0] : vector<1xf32>
%1173 = splat %1172 : vector<4xf32>
%1174 = vector.fma %1173, %677, %1170 : vector<4xf32>
%1175 = vector.extract_strided_slice %673 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1176 = vector.extract %1175[0] : vector<1xf32>
%1177 = splat %1176 : vector<4xf32>
%1178 = vector.fma %1177, %678, %1174 : vector<4xf32>
%1179 = vector.extract_strided_slice %674 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1180 = vector.extract %1179[0] : vector<1xf32>
%1181 = splat %1180 : vector<4xf32>
%1182 = vector.fma %1181, %679, %1178 : vector<4xf32>
%1183 = vector.extract_strided_slice %674 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1184 = vector.extract %1183[0] : vector<1xf32>
%1185 = splat %1184 : vector<4xf32>
%1186 = vector.fma %1185, %680, %1182 : vector<4xf32>
%1187 = vector.extract_strided_slice %674 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1188 = vector.extract %1187[0] : vector<1xf32>
%1189 = splat %1188 : vector<4xf32>
%1190 = vector.fma %1189, %681, %1186 : vector<4xf32>
%1191 = vector.extract_strided_slice %674 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1192 = vector.extract %1191[0] : vector<1xf32>
%1193 = splat %1192 : vector<4xf32>
%1194 = vector.fma %1193, %682, %1190 : vector<4xf32>
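// Software pipelining: the next lines prefetch the following K-tile (offset %arg2 + 8) of the A and B subviews from global memory; the reads tagged __pipelining_global_load__ are carried into the next iteration through the scf.yield below.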
%1195 = addi %arg2, %c8 : index
%1196 = memref.subview %81[0, %1195] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%1197 = addi %arg2, %c8 : index
%1198 = memref.subview %82[%1197, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%1199 = vector.transfer_read %1196[%41, %50], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%1200 = vector.transfer_read %1198[%59, %68], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%1201 = vector.transfer_read %1198[%78, %68], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
scf.yield %714, %746, %778, %810, %842, %874, %906, %938, %970, %1002, %1034, %1066, %1098, %1130, %1162, %1194, %1199, %1200, %1201 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
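// Epilogue of the pipelined loop: flush the last prefetched A/B tiles (%90#16..%90#18) to workgroup shared memory between barriers, then compute the final K-step outside the loop.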
gpu.barrier
vector.transfer_write %90#16, %30[%41, %50] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %90#17, %31[%59, %68] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %90#18, %31[%78, %68] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
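// Per-thread fragments for the final K-step: a 16x8 slice of A (%79, rows %c0..%c15 at columns %c0 and %c4) and an 8x4 slice of B (%80, rows %c0..%c7), all read from shared memory as vector<4xf32> pieces.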
%91 = vector.transfer_read %79[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%92 = vector.transfer_read %79[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%93 = vector.transfer_read %79[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%94 = vector.transfer_read %79[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%95 = vector.transfer_read %79[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%96 = vector.transfer_read %79[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%97 = vector.transfer_read %79[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%98 = vector.transfer_read %79[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%99 = vector.transfer_read %79[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%100 = vector.transfer_read %79[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%101 = vector.transfer_read %79[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%102 = vector.transfer_read %79[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%103 = vector.transfer_read %79[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%104 = vector.transfer_read %79[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%105 = vector.transfer_read %79[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%106 = vector.transfer_read %79[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%107 = vector.transfer_read %79[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%108 = vector.transfer_read %79[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%109 = vector.transfer_read %79[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%110 = vector.transfer_read %79[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%111 = vector.transfer_read %79[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%112 = vector.transfer_read %79[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%113 = vector.transfer_read %79[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%114 = vector.transfer_read %79[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%115 = vector.transfer_read %79[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%116 = vector.transfer_read %79[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%117 = vector.transfer_read %79[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%118 = vector.transfer_read %79[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%119 = vector.transfer_read %79[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%120 = vector.transfer_read %79[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%121 = vector.transfer_read %79[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%122 = vector.transfer_read %79[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%123 = vector.transfer_read %80[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%124 = vector.transfer_read %80[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%125 = vector.transfer_read %80[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%126 = vector.transfer_read %80[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%127 = vector.transfer_read %80[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%128 = vector.transfer_read %80[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%129 = vector.transfer_read %80[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%130 = vector.transfer_read %80[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
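// Final unrolled multiply-accumulate: for each of the 16 output rows, one A element at a
// time is broadcast and fused with a row of B into the vector<4xf32> accumulator. The
// recurring pattern is a rank-1 update; a minimal sketch with hypothetical names:
//   %s   = vector.extract_strided_slice %aPiece {offsets = [k], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
//   %a   = vector.extract %s[0] : vector<1xf32>              // scalar A[m, k]
//   %as  = splat %a : vector<4xf32>                          // broadcast across the 4 lanes
//   %acc = vector.fma %as, %bRowK, %accPrev : vector<4xf32>  // acc[0..3] += A[m, k] * B[k, 0..3]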
%131 = vector.extract_strided_slice %91 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%132 = vector.extract %131[0] : vector<1xf32>
%133 = splat %132 : vector<4xf32>
%134 = vector.fma %133, %123, %90#0 : vector<4xf32>
%135 = vector.extract_strided_slice %91 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%136 = vector.extract %135[0] : vector<1xf32>
%137 = splat %136 : vector<4xf32>
%138 = vector.fma %137, %124, %134 : vector<4xf32>
%139 = vector.extract_strided_slice %91 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%140 = vector.extract %139[0] : vector<1xf32>
%141 = splat %140 : vector<4xf32>
%142 = vector.fma %141, %125, %138 : vector<4xf32>
%143 = vector.extract_strided_slice %91 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%144 = vector.extract %143[0] : vector<1xf32>
%145 = splat %144 : vector<4xf32>
%146 = vector.fma %145, %126, %142 : vector<4xf32>
%147 = vector.extract_strided_slice %92 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%148 = vector.extract %147[0] : vector<1xf32>
%149 = splat %148 : vector<4xf32>
%150 = vector.fma %149, %127, %146 : vector<4xf32>
%151 = vector.extract_strided_slice %92 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%152 = vector.extract %151[0] : vector<1xf32>
%153 = splat %152 : vector<4xf32>
%154 = vector.fma %153, %128, %150 : vector<4xf32>
%155 = vector.extract_strided_slice %92 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%156 = vector.extract %155[0] : vector<1xf32>
%157 = splat %156 : vector<4xf32>
%158 = vector.fma %157, %129, %154 : vector<4xf32>
%159 = vector.extract_strided_slice %92 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%160 = vector.extract %159[0] : vector<1xf32>
%161 = splat %160 : vector<4xf32>
%162 = vector.fma %161, %130, %158 : vector<4xf32>
%163 = vector.extract_strided_slice %93 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%164 = vector.extract %163[0] : vector<1xf32>
%165 = splat %164 : vector<4xf32>
%166 = vector.fma %165, %123, %90#1 : vector<4xf32>
%167 = vector.extract_strided_slice %93 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%168 = vector.extract %167[0] : vector<1xf32>
%169 = splat %168 : vector<4xf32>
%170 = vector.fma %169, %124, %166 : vector<4xf32>
%171 = vector.extract_strided_slice %93 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%172 = vector.extract %171[0] : vector<1xf32>
%173 = splat %172 : vector<4xf32>
%174 = vector.fma %173, %125, %170 : vector<4xf32>
%175 = vector.extract_strided_slice %93 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%176 = vector.extract %175[0] : vector<1xf32>
%177 = splat %176 : vector<4xf32>
%178 = vector.fma %177, %126, %174 : vector<4xf32>
%179 = vector.extract_strided_slice %94 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%180 = vector.extract %179[0] : vector<1xf32>
%181 = splat %180 : vector<4xf32>
%182 = vector.fma %181, %127, %178 : vector<4xf32>
%183 = vector.extract_strided_slice %94 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%184 = vector.extract %183[0] : vector<1xf32>
%185 = splat %184 : vector<4xf32>
%186 = vector.fma %185, %128, %182 : vector<4xf32>
%187 = vector.extract_strided_slice %94 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%188 = vector.extract %187[0] : vector<1xf32>
%189 = splat %188 : vector<4xf32>
%190 = vector.fma %189, %129, %186 : vector<4xf32>
%191 = vector.extract_strided_slice %94 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%192 = vector.extract %191[0] : vector<1xf32>
%193 = splat %192 : vector<4xf32>
%194 = vector.fma %193, %130, %190 : vector<4xf32>
%195 = vector.extract_strided_slice %95 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%196 = vector.extract %195[0] : vector<1xf32>
%197 = splat %196 : vector<4xf32>
%198 = vector.fma %197, %123, %90#2 : vector<4xf32>
%199 = vector.extract_strided_slice %95 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%200 = vector.extract %199[0] : vector<1xf32>
%201 = splat %200 : vector<4xf32>
%202 = vector.fma %201, %124, %198 : vector<4xf32>
%203 = vector.extract_strided_slice %95 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%204 = vector.extract %203[0] : vector<1xf32>
%205 = splat %204 : vector<4xf32>
%206 = vector.fma %205, %125, %202 : vector<4xf32>
%207 = vector.extract_strided_slice %95 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%208 = vector.extract %207[0] : vector<1xf32>
%209 = splat %208 : vector<4xf32>
%210 = vector.fma %209, %126, %206 : vector<4xf32>
%211 = vector.extract_strided_slice %96 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%212 = vector.extract %211[0] : vector<1xf32>
%213 = splat %212 : vector<4xf32>
%214 = vector.fma %213, %127, %210 : vector<4xf32>
%215 = vector.extract_strided_slice %96 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%216 = vector.extract %215[0] : vector<1xf32>
%217 = splat %216 : vector<4xf32>
%218 = vector.fma %217, %128, %214 : vector<4xf32>
%219 = vector.extract_strided_slice %96 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%220 = vector.extract %219[0] : vector<1xf32>
%221 = splat %220 : vector<4xf32>
%222 = vector.fma %221, %129, %218 : vector<4xf32>
%223 = vector.extract_strided_slice %96 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%224 = vector.extract %223[0] : vector<1xf32>
%225 = splat %224 : vector<4xf32>
%226 = vector.fma %225, %130, %222 : vector<4xf32>
%227 = vector.extract_strided_slice %97 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%228 = vector.extract %227[0] : vector<1xf32>
%229 = splat %228 : vector<4xf32>
%230 = vector.fma %229, %123, %90#3 : vector<4xf32>
%231 = vector.extract_strided_slice %97 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%232 = vector.extract %231[0] : vector<1xf32>
%233 = splat %232 : vector<4xf32>
%234 = vector.fma %233, %124, %230 : vector<4xf32>
%235 = vector.extract_strided_slice %97 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%236 = vector.extract %235[0] : vector<1xf32>
%237 = splat %236 : vector<4xf32>
%238 = vector.fma %237, %125, %234 : vector<4xf32>
%239 = vector.extract_strided_slice %97 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%240 = vector.extract %239[0] : vector<1xf32>
%241 = splat %240 : vector<4xf32>
%242 = vector.fma %241, %126, %238 : vector<4xf32>
%243 = vector.extract_strided_slice %98 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%244 = vector.extract %243[0] : vector<1xf32>
%245 = splat %244 : vector<4xf32>
%246 = vector.fma %245, %127, %242 : vector<4xf32>
%247 = vector.extract_strided_slice %98 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%248 = vector.extract %247[0] : vector<1xf32>
%249 = splat %248 : vector<4xf32>
%250 = vector.fma %249, %128, %246 : vector<4xf32>
%251 = vector.extract_strided_slice %98 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%252 = vector.extract %251[0] : vector<1xf32>
%253 = splat %252 : vector<4xf32>
%254 = vector.fma %253, %129, %250 : vector<4xf32>
%255 = vector.extract_strided_slice %98 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%256 = vector.extract %255[0] : vector<1xf32>
%257 = splat %256 : vector<4xf32>
%258 = vector.fma %257, %130, %254 : vector<4xf32>
%259 = vector.extract_strided_slice %99 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%260 = vector.extract %259[0] : vector<1xf32>
%261 = splat %260 : vector<4xf32>
%262 = vector.fma %261, %123, %90#4 : vector<4xf32>
%263 = vector.extract_strided_slice %99 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%264 = vector.extract %263[0] : vector<1xf32>
%265 = splat %264 : vector<4xf32>
%266 = vector.fma %265, %124, %262 : vector<4xf32>
%267 = vector.extract_strided_slice %99 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%268 = vector.extract %267[0] : vector<1xf32>
%269 = splat %268 : vector<4xf32>
%270 = vector.fma %269, %125, %266 : vector<4xf32>
%271 = vector.extract_strided_slice %99 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%272 = vector.extract %271[0] : vector<1xf32>
%273 = splat %272 : vector<4xf32>
%274 = vector.fma %273, %126, %270 : vector<4xf32>
%275 = vector.extract_strided_slice %100 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%276 = vector.extract %275[0] : vector<1xf32>
%277 = splat %276 : vector<4xf32>
%278 = vector.fma %277, %127, %274 : vector<4xf32>
%279 = vector.extract_strided_slice %100 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%280 = vector.extract %279[0] : vector<1xf32>
%281 = splat %280 : vector<4xf32>
%282 = vector.fma %281, %128, %278 : vector<4xf32>
%283 = vector.extract_strided_slice %100 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%284 = vector.extract %283[0] : vector<1xf32>
%285 = splat %284 : vector<4xf32>
%286 = vector.fma %285, %129, %282 : vector<4xf32>
%287 = vector.extract_strided_slice %100 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%288 = vector.extract %287[0] : vector<1xf32>
%289 = splat %288 : vector<4xf32>
%290 = vector.fma %289, %130, %286 : vector<4xf32>
%291 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%292 = vector.extract %291[0] : vector<1xf32>
%293 = splat %292 : vector<4xf32>
%294 = vector.fma %293, %123, %90#5 : vector<4xf32>
%295 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%296 = vector.extract %295[0] : vector<1xf32>
%297 = splat %296 : vector<4xf32>
%298 = vector.fma %297, %124, %294 : vector<4xf32>
%299 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%300 = vector.extract %299[0] : vector<1xf32>
%301 = splat %300 : vector<4xf32>
%302 = vector.fma %301, %125, %298 : vector<4xf32>
%303 = vector.extract_strided_slice %101 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%304 = vector.extract %303[0] : vector<1xf32>
%305 = splat %304 : vector<4xf32>
%306 = vector.fma %305, %126, %302 : vector<4xf32>
%307 = vector.extract_strided_slice %102 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%308 = vector.extract %307[0] : vector<1xf32>
%309 = splat %308 : vector<4xf32>
%310 = vector.fma %309, %127, %306 : vector<4xf32>
%311 = vector.extract_strided_slice %102 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%312 = vector.extract %311[0] : vector<1xf32>
%313 = splat %312 : vector<4xf32>
%314 = vector.fma %313, %128, %310 : vector<4xf32>
%315 = vector.extract_strided_slice %102 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%316 = vector.extract %315[0] : vector<1xf32>
%317 = splat %316 : vector<4xf32>
%318 = vector.fma %317, %129, %314 : vector<4xf32>
%319 = vector.extract_strided_slice %102 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%320 = vector.extract %319[0] : vector<1xf32>
%321 = splat %320 : vector<4xf32>
%322 = vector.fma %321, %130, %318 : vector<4xf32>
%323 = vector.extract_strided_slice %103 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%324 = vector.extract %323[0] : vector<1xf32>
%325 = splat %324 : vector<4xf32>
%326 = vector.fma %325, %123, %90#6 : vector<4xf32>
%327 = vector.extract_strided_slice %103 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%328 = vector.extract %327[0] : vector<1xf32>
%329 = splat %328 : vector<4xf32>
%330 = vector.fma %329, %124, %326 : vector<4xf32>
%331 = vector.extract_strided_slice %103 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%332 = vector.extract %331[0] : vector<1xf32>
%333 = splat %332 : vector<4xf32>
%334 = vector.fma %333, %125, %330 : vector<4xf32>
%335 = vector.extract_strided_slice %103 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%336 = vector.extract %335[0] : vector<1xf32>
%337 = splat %336 : vector<4xf32>
%338 = vector.fma %337, %126, %334 : vector<4xf32>
%339 = vector.extract_strided_slice %104 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%340 = vector.extract %339[0] : vector<1xf32>
%341 = splat %340 : vector<4xf32>
%342 = vector.fma %341, %127, %338 : vector<4xf32>
%343 = vector.extract_strided_slice %104 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%344 = vector.extract %343[0] : vector<1xf32>
%345 = splat %344 : vector<4xf32>
%346 = vector.fma %345, %128, %342 : vector<4xf32>
%347 = vector.extract_strided_slice %104 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%348 = vector.extract %347[0] : vector<1xf32>
%349 = splat %348 : vector<4xf32>
%350 = vector.fma %349, %129, %346 : vector<4xf32>
%351 = vector.extract_strided_slice %104 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%352 = vector.extract %351[0] : vector<1xf32>
%353 = splat %352 : vector<4xf32>
%354 = vector.fma %353, %130, %350 : vector<4xf32>
%355 = vector.extract_strided_slice %105 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%356 = vector.extract %355[0] : vector<1xf32>
%357 = splat %356 : vector<4xf32>
%358 = vector.fma %357, %123, %90#7 : vector<4xf32>
%359 = vector.extract_strided_slice %105 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%360 = vector.extract %359[0] : vector<1xf32>
%361 = splat %360 : vector<4xf32>
%362 = vector.fma %361, %124, %358 : vector<4xf32>
%363 = vector.extract_strided_slice %105 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%364 = vector.extract %363[0] : vector<1xf32>
%365 = splat %364 : vector<4xf32>
%366 = vector.fma %365, %125, %362 : vector<4xf32>
%367 = vector.extract_strided_slice %105 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%368 = vector.extract %367[0] : vector<1xf32>
%369 = splat %368 : vector<4xf32>
%370 = vector.fma %369, %126, %366 : vector<4xf32>
%371 = vector.extract_strided_slice %106 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%372 = vector.extract %371[0] : vector<1xf32>
%373 = splat %372 : vector<4xf32>
%374 = vector.fma %373, %127, %370 : vector<4xf32>
%375 = vector.extract_strided_slice %106 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%376 = vector.extract %375[0] : vector<1xf32>
%377 = splat %376 : vector<4xf32>
%378 = vector.fma %377, %128, %374 : vector<4xf32>
%379 = vector.extract_strided_slice %106 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%380 = vector.extract %379[0] : vector<1xf32>
%381 = splat %380 : vector<4xf32>
%382 = vector.fma %381, %129, %378 : vector<4xf32>
%383 = vector.extract_strided_slice %106 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%384 = vector.extract %383[0] : vector<1xf32>
%385 = splat %384 : vector<4xf32>
%386 = vector.fma %385, %130, %382 : vector<4xf32>
%387 = vector.extract_strided_slice %107 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%388 = vector.extract %387[0] : vector<1xf32>
%389 = splat %388 : vector<4xf32>
%390 = vector.fma %389, %123, %90#8 : vector<4xf32>
%391 = vector.extract_strided_slice %107 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%392 = vector.extract %391[0] : vector<1xf32>
%393 = splat %392 : vector<4xf32>
%394 = vector.fma %393, %124, %390 : vector<4xf32>
%395 = vector.extract_strided_slice %107 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%396 = vector.extract %395[0] : vector<1xf32>
%397 = splat %396 : vector<4xf32>
%398 = vector.fma %397, %125, %394 : vector<4xf32>
%399 = vector.extract_strided_slice %107 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%400 = vector.extract %399[0] : vector<1xf32>
%401 = splat %400 : vector<4xf32>
%402 = vector.fma %401, %126, %398 : vector<4xf32>
%403 = vector.extract_strided_slice %108 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%404 = vector.extract %403[0] : vector<1xf32>
%405 = splat %404 : vector<4xf32>
%406 = vector.fma %405, %127, %402 : vector<4xf32>
%407 = vector.extract_strided_slice %108 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%408 = vector.extract %407[0] : vector<1xf32>
%409 = splat %408 : vector<4xf32>
%410 = vector.fma %409, %128, %406 : vector<4xf32>
%411 = vector.extract_strided_slice %108 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%412 = vector.extract %411[0] : vector<1xf32>
%413 = splat %412 : vector<4xf32>
%414 = vector.fma %413, %129, %410 : vector<4xf32>
%415 = vector.extract_strided_slice %108 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%416 = vector.extract %415[0] : vector<1xf32>
%417 = splat %416 : vector<4xf32>
%418 = vector.fma %417, %130, %414 : vector<4xf32>
%419 = vector.extract_strided_slice %109 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%420 = vector.extract %419[0] : vector<1xf32>
%421 = splat %420 : vector<4xf32>
%422 = vector.fma %421, %123, %90#9 : vector<4xf32>
%423 = vector.extract_strided_slice %109 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%424 = vector.extract %423[0] : vector<1xf32>
%425 = splat %424 : vector<4xf32>
%426 = vector.fma %425, %124, %422 : vector<4xf32>
%427 = vector.extract_strided_slice %109 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%428 = vector.extract %427[0] : vector<1xf32>
%429 = splat %428 : vector<4xf32>
%430 = vector.fma %429, %125, %426 : vector<4xf32>
%431 = vector.extract_strided_slice %109 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%432 = vector.extract %431[0] : vector<1xf32>
%433 = splat %432 : vector<4xf32>
%434 = vector.fma %433, %126, %430 : vector<4xf32>
%435 = vector.extract_strided_slice %110 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%436 = vector.extract %435[0] : vector<1xf32>
%437 = splat %436 : vector<4xf32>
%438 = vector.fma %437, %127, %434 : vector<4xf32>
%439 = vector.extract_strided_slice %110 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%440 = vector.extract %439[0] : vector<1xf32>
%441 = splat %440 : vector<4xf32>
%442 = vector.fma %441, %128, %438 : vector<4xf32>
%443 = vector.extract_strided_slice %110 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%444 = vector.extract %443[0] : vector<1xf32>
%445 = splat %444 : vector<4xf32>
%446 = vector.fma %445, %129, %442 : vector<4xf32>
%447 = vector.extract_strided_slice %110 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%448 = vector.extract %447[0] : vector<1xf32>
%449 = splat %448 : vector<4xf32>
%450 = vector.fma %449, %130, %446 : vector<4xf32>
%451 = vector.extract_strided_slice %111 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%452 = vector.extract %451[0] : vector<1xf32>
%453 = splat %452 : vector<4xf32>
%454 = vector.fma %453, %123, %90#10 : vector<4xf32>
%455 = vector.extract_strided_slice %111 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%456 = vector.extract %455[0] : vector<1xf32>
%457 = splat %456 : vector<4xf32>
%458 = vector.fma %457, %124, %454 : vector<4xf32>
%459 = vector.extract_strided_slice %111 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%460 = vector.extract %459[0] : vector<1xf32>
%461 = splat %460 : vector<4xf32>
%462 = vector.fma %461, %125, %458 : vector<4xf32>
%463 = vector.extract_strided_slice %111 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%464 = vector.extract %463[0] : vector<1xf32>
%465 = splat %464 : vector<4xf32>
%466 = vector.fma %465, %126, %462 : vector<4xf32>
%467 = vector.extract_strided_slice %112 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%468 = vector.extract %467[0] : vector<1xf32>
%469 = splat %468 : vector<4xf32>
%470 = vector.fma %469, %127, %466 : vector<4xf32>
%471 = vector.extract_strided_slice %112 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%472 = vector.extract %471[0] : vector<1xf32>
%473 = splat %472 : vector<4xf32>
%474 = vector.fma %473, %128, %470 : vector<4xf32>
%475 = vector.extract_strided_slice %112 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%476 = vector.extract %475[0] : vector<1xf32>
%477 = splat %476 : vector<4xf32>
%478 = vector.fma %477, %129, %474 : vector<4xf32>
%479 = vector.extract_strided_slice %112 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%480 = vector.extract %479[0] : vector<1xf32>
%481 = splat %480 : vector<4xf32>
%482 = vector.fma %481, %130, %478 : vector<4xf32>
%483 = vector.extract_strided_slice %113 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%484 = vector.extract %483[0] : vector<1xf32>
%485 = splat %484 : vector<4xf32>
%486 = vector.fma %485, %123, %90#11 : vector<4xf32>
%487 = vector.extract_strided_slice %113 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%488 = vector.extract %487[0] : vector<1xf32>
%489 = splat %488 : vector<4xf32>
%490 = vector.fma %489, %124, %486 : vector<4xf32>
%491 = vector.extract_strided_slice %113 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%492 = vector.extract %491[0] : vector<1xf32>
%493 = splat %492 : vector<4xf32>
%494 = vector.fma %493, %125, %490 : vector<4xf32>
%495 = vector.extract_strided_slice %113 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%496 = vector.extract %495[0] : vector<1xf32>
%497 = splat %496 : vector<4xf32>
%498 = vector.fma %497, %126, %494 : vector<4xf32>
%499 = vector.extract_strided_slice %114 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%500 = vector.extract %499[0] : vector<1xf32>
%501 = splat %500 : vector<4xf32>
%502 = vector.fma %501, %127, %498 : vector<4xf32>
%503 = vector.extract_strided_slice %114 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%504 = vector.extract %503[0] : vector<1xf32>
%505 = splat %504 : vector<4xf32>
%506 = vector.fma %505, %128, %502 : vector<4xf32>
%507 = vector.extract_strided_slice %114 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%508 = vector.extract %507[0] : vector<1xf32>
%509 = splat %508 : vector<4xf32>
%510 = vector.fma %509, %129, %506 : vector<4xf32>
%511 = vector.extract_strided_slice %114 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%512 = vector.extract %511[0] : vector<1xf32>
%513 = splat %512 : vector<4xf32>
%514 = vector.fma %513, %130, %510 : vector<4xf32>
%515 = vector.extract_strided_slice %115 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%516 = vector.extract %515[0] : vector<1xf32>
%517 = splat %516 : vector<4xf32>
%518 = vector.fma %517, %123, %90#12 : vector<4xf32>
%519 = vector.extract_strided_slice %115 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%520 = vector.extract %519[0] : vector<1xf32>
%521 = splat %520 : vector<4xf32>
%522 = vector.fma %521, %124, %518 : vector<4xf32>
%523 = vector.extract_strided_slice %115 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%524 = vector.extract %523[0] : vector<1xf32>
%525 = splat %524 : vector<4xf32>
%526 = vector.fma %525, %125, %522 : vector<4xf32>
%527 = vector.extract_strided_slice %115 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%528 = vector.extract %527[0] : vector<1xf32>
%529 = splat %528 : vector<4xf32>
%530 = vector.fma %529, %126, %526 : vector<4xf32>
%531 = vector.extract_strided_slice %116 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%532 = vector.extract %531[0] : vector<1xf32>
%533 = splat %532 : vector<4xf32>
%534 = vector.fma %533, %127, %530 : vector<4xf32>
%535 = vector.extract_strided_slice %116 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%536 = vector.extract %535[0] : vector<1xf32>
%537 = splat %536 : vector<4xf32>
%538 = vector.fma %537, %128, %534 : vector<4xf32>
%539 = vector.extract_strided_slice %116 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%540 = vector.extract %539[0] : vector<1xf32>
%541 = splat %540 : vector<4xf32>
%542 = vector.fma %541, %129, %538 : vector<4xf32>
%543 = vector.extract_strided_slice %116 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%544 = vector.extract %543[0] : vector<1xf32>
%545 = splat %544 : vector<4xf32>
%546 = vector.fma %545, %130, %542 : vector<4xf32>
%547 = vector.extract_strided_slice %117 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%548 = vector.extract %547[0] : vector<1xf32>
%549 = splat %548 : vector<4xf32>
%550 = vector.fma %549, %123, %90#13 : vector<4xf32>
%551 = vector.extract_strided_slice %117 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%552 = vector.extract %551[0] : vector<1xf32>
%553 = splat %552 : vector<4xf32>
%554 = vector.fma %553, %124, %550 : vector<4xf32>
%555 = vector.extract_strided_slice %117 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%556 = vector.extract %555[0] : vector<1xf32>
%557 = splat %556 : vector<4xf32>
%558 = vector.fma %557, %125, %554 : vector<4xf32>
%559 = vector.extract_strided_slice %117 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%560 = vector.extract %559[0] : vector<1xf32>
%561 = splat %560 : vector<4xf32>
%562 = vector.fma %561, %126, %558 : vector<4xf32>
%563 = vector.extract_strided_slice %118 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%564 = vector.extract %563[0] : vector<1xf32>
%565 = splat %564 : vector<4xf32>
%566 = vector.fma %565, %127, %562 : vector<4xf32>
%567 = vector.extract_strided_slice %118 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%568 = vector.extract %567[0] : vector<1xf32>
%569 = splat %568 : vector<4xf32>
%570 = vector.fma %569, %128, %566 : vector<4xf32>
%571 = vector.extract_strided_slice %118 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%572 = vector.extract %571[0] : vector<1xf32>
%573 = splat %572 : vector<4xf32>
%574 = vector.fma %573, %129, %570 : vector<4xf32>
%575 = vector.extract_strided_slice %118 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%576 = vector.extract %575[0] : vector<1xf32>
%577 = splat %576 : vector<4xf32>
%578 = vector.fma %577, %130, %574 : vector<4xf32>
%579 = vector.extract_strided_slice %119 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%580 = vector.extract %579[0] : vector<1xf32>
%581 = splat %580 : vector<4xf32>
%582 = vector.fma %581, %123, %90#14 : vector<4xf32>
%583 = vector.extract_strided_slice %119 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%584 = vector.extract %583[0] : vector<1xf32>
%585 = splat %584 : vector<4xf32>
%586 = vector.fma %585, %124, %582 : vector<4xf32>
%587 = vector.extract_strided_slice %119 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%588 = vector.extract %587[0] : vector<1xf32>
%589 = splat %588 : vector<4xf32>
%590 = vector.fma %589, %125, %586 : vector<4xf32>
%591 = vector.extract_strided_slice %119 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%592 = vector.extract %591[0] : vector<1xf32>
%593 = splat %592 : vector<4xf32>
%594 = vector.fma %593, %126, %590 : vector<4xf32>
%595 = vector.extract_strided_slice %120 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%596 = vector.extract %595[0] : vector<1xf32>
%597 = splat %596 : vector<4xf32>
%598 = vector.fma %597, %127, %594 : vector<4xf32>
%599 = vector.extract_strided_slice %120 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%600 = vector.extract %599[0] : vector<1xf32>
%601 = splat %600 : vector<4xf32>
%602 = vector.fma %601, %128, %598 : vector<4xf32>
%603 = vector.extract_strided_slice %120 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%604 = vector.extract %603[0] : vector<1xf32>
%605 = splat %604 : vector<4xf32>
%606 = vector.fma %605, %129, %602 : vector<4xf32>
%607 = vector.extract_strided_slice %120 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%608 = vector.extract %607[0] : vector<1xf32>
%609 = splat %608 : vector<4xf32>
%610 = vector.fma %609, %130, %606 : vector<4xf32>
%611 = vector.extract_strided_slice %121 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%612 = vector.extract %611[0] : vector<1xf32>
%613 = splat %612 : vector<4xf32>
%614 = vector.fma %613, %123, %90#15 : vector<4xf32>
%615 = vector.extract_strided_slice %121 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%616 = vector.extract %615[0] : vector<1xf32>
%617 = splat %616 : vector<4xf32>
%618 = vector.fma %617, %124, %614 : vector<4xf32>
%619 = vector.extract_strided_slice %121 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%620 = vector.extract %619[0] : vector<1xf32>
%621 = splat %620 : vector<4xf32>
%622 = vector.fma %621, %125, %618 : vector<4xf32>
%623 = vector.extract_strided_slice %121 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%624 = vector.extract %623[0] : vector<1xf32>
%625 = splat %624 : vector<4xf32>
%626 = vector.fma %625, %126, %622 : vector<4xf32>
%627 = vector.extract_strided_slice %122 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%628 = vector.extract %627[0] : vector<1xf32>
%629 = splat %628 : vector<4xf32>
%630 = vector.fma %629, %127, %626 : vector<4xf32>
%631 = vector.extract_strided_slice %122 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%632 = vector.extract %631[0] : vector<1xf32>
%633 = splat %632 : vector<4xf32>
%634 = vector.fma %633, %128, %630 : vector<4xf32>
%635 = vector.extract_strided_slice %122 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%636 = vector.extract %635[0] : vector<1xf32>
%637 = splat %636 : vector<4xf32>
%638 = vector.fma %637, %129, %634 : vector<4xf32>
%639 = vector.extract_strided_slice %122 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%640 = vector.extract %639[0] : vector<1xf32>
%641 = splat %640 : vector<4xf32>
%642 = vector.fma %641, %130, %638 : vector<4xf32>
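// Store the 16 finished accumulator rows to this thread's 16x4 output subview (%84) in global memory, row 15 down to row 0.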
vector.transfer_write %642, %84[%c15, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %610, %84[%c14, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %578, %84[%c13, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %546, %84[%c12, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %514, %84[%c11, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %482, %84[%c10, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %450, %84[%c9, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %418, %84[%c8, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %386, %84[%c7, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %354, %84[%c6, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %322, %84[%c5, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %290, %84[%c4, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %258, %84[%c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %226, %84[%c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %194, %84[%c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %162, %84[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After Canonicalizer //----- //
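// Canonicalized form of the same kernel: constants are hoisted to the top of the function and the per-thread index arithmetic is expanded inline; the repeated divi_signed/select chains are duplicates that a later CSE pass can fold.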
module {
memref.global "private" @__shared_memory___0 : memref<8x128xf32, 3>
memref.global "private" @__shared_memory__ : memref<64x8xf32, 3>
func @_large_aligned_dispatch_0() {
%c4 = constant 4 : index
%c-1 = constant -1 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c-128 = constant -128 : index
%c-8 = constant -8 : index
%c2 = constant 2 : index
%c64 = constant 64 : index
%c16 = constant 16 : index
%cst = constant dense<0.000000e+00> : vector<4xf32>
%c128 = constant 128 : index
%c1016 = constant 1016 : index
%c8 = constant 8 : index
%cst_0 = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c5 = constant 5 : index
%c6 = constant 6 : index
%c7 = constant 7 : index
%c9 = constant 9 : index
%c10 = constant 10 : index
%c11 = constant 11 : index
%c12 = constant 12 : index
%c13 = constant 13 : index
%c14 = constant 14 : index
%c15 = constant 15 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
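// Buffers bound through @io: A = 2048x1024 (s0b0, read-only), B = 1024x512 (s0b1, read-only), C = 2048x512 (s0b2, write/discard).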
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = muli %workgroup_id_y, %c64 : index
%9 = muli %workgroup_count_y, %c64 : index
%10 = muli %workgroup_id_x, %c128 : index
%11 = muli %workgroup_count_x, %c128 : index
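// Workgroup tiling: each workgroup covers a 64x128 tile of C (row stride 64, column stride 128); %12 and %13 below are the per-thread row/column offsets within that tile (16 * tid.y and 4 * tid.x).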
%12 = muli %1, %c16 : index
%13 = muli %0, %c4 : index
%14 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%15 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%16 = muli %1, %c16 : index
%17 = muli %2, %c64 : index
%18 = addi %16, %17 : index
%19 = cmpi slt, %0, %c0 : index
%20 = subi %c-1, %0 : index
%21 = select %19, %20, %0 : index
%22 = divi_signed %21, %c2 : index
%23 = subi %c-1, %22 : index
%24 = select %19, %23, %22 : index
%25 = addi %18, %24 : index
%26 = muli %0, %c4 : index
%27 = cmpi slt, %0, %c0 : index
%28 = subi %c-1, %0 : index
%29 = select %27, %28, %0 : index
%30 = divi_signed %29, %c2 : index
%31 = subi %c-1, %30 : index
%32 = select %27, %31, %30 : index
%33 = muli %32, %c-8 : index
%34 = addi %26, %33 : index
%35 = muli %2, %c4 : index
%36 = addi %1, %35 : index
%37 = cmpi slt, %0, %c0 : index
%38 = subi %c-1, %0 : index
%39 = select %37, %38, %0 : index
%40 = divi_signed %39, %c32 : index
%41 = subi %c-1, %40 : index
%42 = select %37, %41, %40 : index
%43 = addi %36, %42 : index
%44 = muli %0, %c4 : index
%45 = cmpi slt, %0, %c0 : index
%46 = subi %c-1, %0 : index
%47 = select %45, %46, %0 : index
%48 = divi_signed %47, %c32 : index
%49 = subi %c-1, %48 : index
%50 = select %45, %49, %48 : index
%51 = muli %50, %c-128 : index
%52 = addi %44, %51 : index
%53 = muli %2, %c4 : index
%54 = addi %1, %53 : index
%55 = cmpi slt, %0, %c0 : index
%56 = subi %c-1, %0 : index
%57 = select %55, %56, %0 : index
%58 = divi_signed %57, %c32 : index
%59 = subi %c-1, %58 : index
%60 = select %55, %59, %58 : index
%61 = addi %54, %60 : index
%62 = addi %61, %c4 : index
%63 = memref.subview %14[%12, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%64 = memref.subview %15[0, %13] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
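// Per-thread staging addresses: (%25, %34) is where this thread writes its
// vector<4xf32> of A into the 64x8 buffer, and (%43, %52) / (%62, %52) are
// its two rows in the 8x128 B buffer (64*8 + 8*128 loaded floats per K-step,
// split across 128 threads). %63 and %64 are the thread's read-side
// fragments: a 16x8 slice of staged A and an 8x4 slice of staged B.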
scf.for %arg0 = %8 to %c2048 step %9 {
%65 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
scf.for %arg1 = %10 to %c512 step %11 {
%66 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%67 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%68 = memref.subview %67[%12, %13] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%69 = memref.subview %65[0, 0] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%70 = memref.subview %66[0, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%71 = vector.transfer_read %69[%25, %34], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%72 = vector.transfer_read %70[%43, %52], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%73 = vector.transfer_read %70[%62, %52], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
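// Software-pipeline prologue: issue the first K-step's global loads (tagged
// __pipelining_global_load__) before entering the loop. The loop then carries
// 19 values: 16 vector<4xf32> accumulators for the thread's 16x4 sub-tile of
// C, plus the 3 in-flight global loads destined for shared memory.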
%74:19 = scf.for %arg2 = %c0 to %c1016 step %c8 iter_args(%arg3 = %cst, %arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst, %arg8 = %cst, %arg9 = %cst, %arg10 = %cst, %arg11 = %cst, %arg12 = %cst, %arg13 = %cst, %arg14 = %cst, %arg15 = %cst, %arg16 = %cst, %arg17 = %cst, %arg18 = %cst, %arg19 = %71, %arg20 = %72, %arg21 = %73) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
gpu.barrier
vector.transfer_write %arg19, %14[%25, %34] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %arg20, %15[%43, %52] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %arg21, %15[%62, %52] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
%627 = vector.transfer_read %63[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%628 = vector.transfer_read %63[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%629 = vector.transfer_read %63[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%630 = vector.transfer_read %63[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%631 = vector.transfer_read %63[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%632 = vector.transfer_read %63[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%633 = vector.transfer_read %63[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%634 = vector.transfer_read %63[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%635 = vector.transfer_read %63[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%636 = vector.transfer_read %63[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%637 = vector.transfer_read %63[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%638 = vector.transfer_read %63[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%639 = vector.transfer_read %63[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%640 = vector.transfer_read %63[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%641 = vector.transfer_read %63[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%642 = vector.transfer_read %63[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%643 = vector.transfer_read %63[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%644 = vector.transfer_read %63[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%645 = vector.transfer_read %63[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%646 = vector.transfer_read %63[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%647 = vector.transfer_read %63[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%648 = vector.transfer_read %63[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%649 = vector.transfer_read %63[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%650 = vector.transfer_read %63[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%651 = vector.transfer_read %63[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%652 = vector.transfer_read %63[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%653 = vector.transfer_read %63[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%654 = vector.transfer_read %63[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%655 = vector.transfer_read %63[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%656 = vector.transfer_read %63[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%657 = vector.transfer_read %63[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%658 = vector.transfer_read %63[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%659 = vector.transfer_read %64[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%660 = vector.transfer_read %64[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%661 = vector.transfer_read %64[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%662 = vector.transfer_read %64[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%663 = vector.transfer_read %64[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%664 = vector.transfer_read %64[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%665 = vector.transfer_read %64[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%666 = vector.transfer_read %64[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
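// With the staged tiles in shared memory, read this thread's fragments:
// %627..%658 are the 16 rows of A (two vec4 per row covering k = 0..7) and
// %659..%666 are the 8 rows of B (one vec4 per k at this thread's columns).
// The chain below is the outer-product microkernel: for each row i and each
// k, splat A[i][k] and fma it against B[k][0:4], i.e.
//   %s   = vector.extract_strided_slice %A_row {offsets = [k], ...}
//   %b   = splat(%s[0]) : vector<4xf32>
//   %acc = vector.fma %b, %B_k, %acc
// for 16 x 8 = 128 fmas per thread per K-step.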
%667 = vector.extract_strided_slice %627 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%668 = vector.extract %667[0] : vector<1xf32>
%669 = splat %668 : vector<4xf32>
%670 = vector.fma %669, %659, %arg3 : vector<4xf32>
%671 = vector.extract_strided_slice %627 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%672 = vector.extract %671[0] : vector<1xf32>
%673 = splat %672 : vector<4xf32>
%674 = vector.fma %673, %660, %670 : vector<4xf32>
%675 = vector.extract_strided_slice %627 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%676 = vector.extract %675[0] : vector<1xf32>
%677 = splat %676 : vector<4xf32>
%678 = vector.fma %677, %661, %674 : vector<4xf32>
%679 = vector.extract_strided_slice %627 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%680 = vector.extract %679[0] : vector<1xf32>
%681 = splat %680 : vector<4xf32>
%682 = vector.fma %681, %662, %678 : vector<4xf32>
%683 = vector.extract_strided_slice %628 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%684 = vector.extract %683[0] : vector<1xf32>
%685 = splat %684 : vector<4xf32>
%686 = vector.fma %685, %663, %682 : vector<4xf32>
%687 = vector.extract_strided_slice %628 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%688 = vector.extract %687[0] : vector<1xf32>
%689 = splat %688 : vector<4xf32>
%690 = vector.fma %689, %664, %686 : vector<4xf32>
%691 = vector.extract_strided_slice %628 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%692 = vector.extract %691[0] : vector<1xf32>
%693 = splat %692 : vector<4xf32>
%694 = vector.fma %693, %665, %690 : vector<4xf32>
%695 = vector.extract_strided_slice %628 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%696 = vector.extract %695[0] : vector<1xf32>
%697 = splat %696 : vector<4xf32>
%698 = vector.fma %697, %666, %694 : vector<4xf32>
%699 = vector.extract_strided_slice %629 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%700 = vector.extract %699[0] : vector<1xf32>
%701 = splat %700 : vector<4xf32>
%702 = vector.fma %701, %659, %arg4 : vector<4xf32>
%703 = vector.extract_strided_slice %629 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%704 = vector.extract %703[0] : vector<1xf32>
%705 = splat %704 : vector<4xf32>
%706 = vector.fma %705, %660, %702 : vector<4xf32>
%707 = vector.extract_strided_slice %629 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%708 = vector.extract %707[0] : vector<1xf32>
%709 = splat %708 : vector<4xf32>
%710 = vector.fma %709, %661, %706 : vector<4xf32>
%711 = vector.extract_strided_slice %629 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%712 = vector.extract %711[0] : vector<1xf32>
%713 = splat %712 : vector<4xf32>
%714 = vector.fma %713, %662, %710 : vector<4xf32>
%715 = vector.extract_strided_slice %630 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%716 = vector.extract %715[0] : vector<1xf32>
%717 = splat %716 : vector<4xf32>
%718 = vector.fma %717, %663, %714 : vector<4xf32>
%719 = vector.extract_strided_slice %630 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%720 = vector.extract %719[0] : vector<1xf32>
%721 = splat %720 : vector<4xf32>
%722 = vector.fma %721, %664, %718 : vector<4xf32>
%723 = vector.extract_strided_slice %630 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%724 = vector.extract %723[0] : vector<1xf32>
%725 = splat %724 : vector<4xf32>
%726 = vector.fma %725, %665, %722 : vector<4xf32>
%727 = vector.extract_strided_slice %630 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%728 = vector.extract %727[0] : vector<1xf32>
%729 = splat %728 : vector<4xf32>
%730 = vector.fma %729, %666, %726 : vector<4xf32>
%731 = vector.extract_strided_slice %631 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%732 = vector.extract %731[0] : vector<1xf32>
%733 = splat %732 : vector<4xf32>
%734 = vector.fma %733, %659, %arg5 : vector<4xf32>
%735 = vector.extract_strided_slice %631 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%736 = vector.extract %735[0] : vector<1xf32>
%737 = splat %736 : vector<4xf32>
%738 = vector.fma %737, %660, %734 : vector<4xf32>
%739 = vector.extract_strided_slice %631 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%740 = vector.extract %739[0] : vector<1xf32>
%741 = splat %740 : vector<4xf32>
%742 = vector.fma %741, %661, %738 : vector<4xf32>
%743 = vector.extract_strided_slice %631 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%744 = vector.extract %743[0] : vector<1xf32>
%745 = splat %744 : vector<4xf32>
%746 = vector.fma %745, %662, %742 : vector<4xf32>
%747 = vector.extract_strided_slice %632 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%748 = vector.extract %747[0] : vector<1xf32>
%749 = splat %748 : vector<4xf32>
%750 = vector.fma %749, %663, %746 : vector<4xf32>
%751 = vector.extract_strided_slice %632 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%752 = vector.extract %751[0] : vector<1xf32>
%753 = splat %752 : vector<4xf32>
%754 = vector.fma %753, %664, %750 : vector<4xf32>
%755 = vector.extract_strided_slice %632 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%756 = vector.extract %755[0] : vector<1xf32>
%757 = splat %756 : vector<4xf32>
%758 = vector.fma %757, %665, %754 : vector<4xf32>
%759 = vector.extract_strided_slice %632 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%760 = vector.extract %759[0] : vector<1xf32>
%761 = splat %760 : vector<4xf32>
%762 = vector.fma %761, %666, %758 : vector<4xf32>
%763 = vector.extract_strided_slice %633 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%764 = vector.extract %763[0] : vector<1xf32>
%765 = splat %764 : vector<4xf32>
%766 = vector.fma %765, %659, %arg6 : vector<4xf32>
%767 = vector.extract_strided_slice %633 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%768 = vector.extract %767[0] : vector<1xf32>
%769 = splat %768 : vector<4xf32>
%770 = vector.fma %769, %660, %766 : vector<4xf32>
%771 = vector.extract_strided_slice %633 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%772 = vector.extract %771[0] : vector<1xf32>
%773 = splat %772 : vector<4xf32>
%774 = vector.fma %773, %661, %770 : vector<4xf32>
%775 = vector.extract_strided_slice %633 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%776 = vector.extract %775[0] : vector<1xf32>
%777 = splat %776 : vector<4xf32>
%778 = vector.fma %777, %662, %774 : vector<4xf32>
%779 = vector.extract_strided_slice %634 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%780 = vector.extract %779[0] : vector<1xf32>
%781 = splat %780 : vector<4xf32>
%782 = vector.fma %781, %663, %778 : vector<4xf32>
%783 = vector.extract_strided_slice %634 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%784 = vector.extract %783[0] : vector<1xf32>
%785 = splat %784 : vector<4xf32>
%786 = vector.fma %785, %664, %782 : vector<4xf32>
%787 = vector.extract_strided_slice %634 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%788 = vector.extract %787[0] : vector<1xf32>
%789 = splat %788 : vector<4xf32>
%790 = vector.fma %789, %665, %786 : vector<4xf32>
%791 = vector.extract_strided_slice %634 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%792 = vector.extract %791[0] : vector<1xf32>
%793 = splat %792 : vector<4xf32>
%794 = vector.fma %793, %666, %790 : vector<4xf32>
%795 = vector.extract_strided_slice %635 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%796 = vector.extract %795[0] : vector<1xf32>
%797 = splat %796 : vector<4xf32>
%798 = vector.fma %797, %659, %arg7 : vector<4xf32>
%799 = vector.extract_strided_slice %635 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%800 = vector.extract %799[0] : vector<1xf32>
%801 = splat %800 : vector<4xf32>
%802 = vector.fma %801, %660, %798 : vector<4xf32>
%803 = vector.extract_strided_slice %635 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%804 = vector.extract %803[0] : vector<1xf32>
%805 = splat %804 : vector<4xf32>
%806 = vector.fma %805, %661, %802 : vector<4xf32>
%807 = vector.extract_strided_slice %635 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%808 = vector.extract %807[0] : vector<1xf32>
%809 = splat %808 : vector<4xf32>
%810 = vector.fma %809, %662, %806 : vector<4xf32>
%811 = vector.extract_strided_slice %636 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%812 = vector.extract %811[0] : vector<1xf32>
%813 = splat %812 : vector<4xf32>
%814 = vector.fma %813, %663, %810 : vector<4xf32>
%815 = vector.extract_strided_slice %636 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%816 = vector.extract %815[0] : vector<1xf32>
%817 = splat %816 : vector<4xf32>
%818 = vector.fma %817, %664, %814 : vector<4xf32>
%819 = vector.extract_strided_slice %636 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%820 = vector.extract %819[0] : vector<1xf32>
%821 = splat %820 : vector<4xf32>
%822 = vector.fma %821, %665, %818 : vector<4xf32>
%823 = vector.extract_strided_slice %636 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%824 = vector.extract %823[0] : vector<1xf32>
%825 = splat %824 : vector<4xf32>
%826 = vector.fma %825, %666, %822 : vector<4xf32>
%827 = vector.extract_strided_slice %637 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%828 = vector.extract %827[0] : vector<1xf32>
%829 = splat %828 : vector<4xf32>
%830 = vector.fma %829, %659, %arg8 : vector<4xf32>
%831 = vector.extract_strided_slice %637 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%832 = vector.extract %831[0] : vector<1xf32>
%833 = splat %832 : vector<4xf32>
%834 = vector.fma %833, %660, %830 : vector<4xf32>
%835 = vector.extract_strided_slice %637 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%836 = vector.extract %835[0] : vector<1xf32>
%837 = splat %836 : vector<4xf32>
%838 = vector.fma %837, %661, %834 : vector<4xf32>
%839 = vector.extract_strided_slice %637 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%840 = vector.extract %839[0] : vector<1xf32>
%841 = splat %840 : vector<4xf32>
%842 = vector.fma %841, %662, %838 : vector<4xf32>
%843 = vector.extract_strided_slice %638 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%844 = vector.extract %843[0] : vector<1xf32>
%845 = splat %844 : vector<4xf32>
%846 = vector.fma %845, %663, %842 : vector<4xf32>
%847 = vector.extract_strided_slice %638 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%848 = vector.extract %847[0] : vector<1xf32>
%849 = splat %848 : vector<4xf32>
%850 = vector.fma %849, %664, %846 : vector<4xf32>
%851 = vector.extract_strided_slice %638 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%852 = vector.extract %851[0] : vector<1xf32>
%853 = splat %852 : vector<4xf32>
%854 = vector.fma %853, %665, %850 : vector<4xf32>
%855 = vector.extract_strided_slice %638 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%856 = vector.extract %855[0] : vector<1xf32>
%857 = splat %856 : vector<4xf32>
%858 = vector.fma %857, %666, %854 : vector<4xf32>
%859 = vector.extract_strided_slice %639 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%860 = vector.extract %859[0] : vector<1xf32>
%861 = splat %860 : vector<4xf32>
%862 = vector.fma %861, %659, %arg9 : vector<4xf32>
%863 = vector.extract_strided_slice %639 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%864 = vector.extract %863[0] : vector<1xf32>
%865 = splat %864 : vector<4xf32>
%866 = vector.fma %865, %660, %862 : vector<4xf32>
%867 = vector.extract_strided_slice %639 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%868 = vector.extract %867[0] : vector<1xf32>
%869 = splat %868 : vector<4xf32>
%870 = vector.fma %869, %661, %866 : vector<4xf32>
%871 = vector.extract_strided_slice %639 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%872 = vector.extract %871[0] : vector<1xf32>
%873 = splat %872 : vector<4xf32>
%874 = vector.fma %873, %662, %870 : vector<4xf32>
%875 = vector.extract_strided_slice %640 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%876 = vector.extract %875[0] : vector<1xf32>
%877 = splat %876 : vector<4xf32>
%878 = vector.fma %877, %663, %874 : vector<4xf32>
%879 = vector.extract_strided_slice %640 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%880 = vector.extract %879[0] : vector<1xf32>
%881 = splat %880 : vector<4xf32>
%882 = vector.fma %881, %664, %878 : vector<4xf32>
%883 = vector.extract_strided_slice %640 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%884 = vector.extract %883[0] : vector<1xf32>
%885 = splat %884 : vector<4xf32>
%886 = vector.fma %885, %665, %882 : vector<4xf32>
%887 = vector.extract_strided_slice %640 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%888 = vector.extract %887[0] : vector<1xf32>
%889 = splat %888 : vector<4xf32>
%890 = vector.fma %889, %666, %886 : vector<4xf32>
%891 = vector.extract_strided_slice %641 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%892 = vector.extract %891[0] : vector<1xf32>
%893 = splat %892 : vector<4xf32>
%894 = vector.fma %893, %659, %arg10 : vector<4xf32>
%895 = vector.extract_strided_slice %641 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%896 = vector.extract %895[0] : vector<1xf32>
%897 = splat %896 : vector<4xf32>
%898 = vector.fma %897, %660, %894 : vector<4xf32>
%899 = vector.extract_strided_slice %641 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%900 = vector.extract %899[0] : vector<1xf32>
%901 = splat %900 : vector<4xf32>
%902 = vector.fma %901, %661, %898 : vector<4xf32>
%903 = vector.extract_strided_slice %641 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%904 = vector.extract %903[0] : vector<1xf32>
%905 = splat %904 : vector<4xf32>
%906 = vector.fma %905, %662, %902 : vector<4xf32>
%907 = vector.extract_strided_slice %642 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%908 = vector.extract %907[0] : vector<1xf32>
%909 = splat %908 : vector<4xf32>
%910 = vector.fma %909, %663, %906 : vector<4xf32>
%911 = vector.extract_strided_slice %642 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%912 = vector.extract %911[0] : vector<1xf32>
%913 = splat %912 : vector<4xf32>
%914 = vector.fma %913, %664, %910 : vector<4xf32>
%915 = vector.extract_strided_slice %642 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%916 = vector.extract %915[0] : vector<1xf32>
%917 = splat %916 : vector<4xf32>
%918 = vector.fma %917, %665, %914 : vector<4xf32>
%919 = vector.extract_strided_slice %642 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%920 = vector.extract %919[0] : vector<1xf32>
%921 = splat %920 : vector<4xf32>
%922 = vector.fma %921, %666, %918 : vector<4xf32>
%923 = vector.extract_strided_slice %643 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%924 = vector.extract %923[0] : vector<1xf32>
%925 = splat %924 : vector<4xf32>
%926 = vector.fma %925, %659, %arg11 : vector<4xf32>
%927 = vector.extract_strided_slice %643 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%928 = vector.extract %927[0] : vector<1xf32>
%929 = splat %928 : vector<4xf32>
%930 = vector.fma %929, %660, %926 : vector<4xf32>
%931 = vector.extract_strided_slice %643 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%932 = vector.extract %931[0] : vector<1xf32>
%933 = splat %932 : vector<4xf32>
%934 = vector.fma %933, %661, %930 : vector<4xf32>
%935 = vector.extract_strided_slice %643 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%936 = vector.extract %935[0] : vector<1xf32>
%937 = splat %936 : vector<4xf32>
%938 = vector.fma %937, %662, %934 : vector<4xf32>
%939 = vector.extract_strided_slice %644 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%940 = vector.extract %939[0] : vector<1xf32>
%941 = splat %940 : vector<4xf32>
%942 = vector.fma %941, %663, %938 : vector<4xf32>
%943 = vector.extract_strided_slice %644 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%944 = vector.extract %943[0] : vector<1xf32>
%945 = splat %944 : vector<4xf32>
%946 = vector.fma %945, %664, %942 : vector<4xf32>
%947 = vector.extract_strided_slice %644 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%948 = vector.extract %947[0] : vector<1xf32>
%949 = splat %948 : vector<4xf32>
%950 = vector.fma %949, %665, %946 : vector<4xf32>
%951 = vector.extract_strided_slice %644 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%952 = vector.extract %951[0] : vector<1xf32>
%953 = splat %952 : vector<4xf32>
%954 = vector.fma %953, %666, %950 : vector<4xf32>
%955 = vector.extract_strided_slice %645 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%956 = vector.extract %955[0] : vector<1xf32>
%957 = splat %956 : vector<4xf32>
%958 = vector.fma %957, %659, %arg12 : vector<4xf32>
%959 = vector.extract_strided_slice %645 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%960 = vector.extract %959[0] : vector<1xf32>
%961 = splat %960 : vector<4xf32>
%962 = vector.fma %961, %660, %958 : vector<4xf32>
%963 = vector.extract_strided_slice %645 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%964 = vector.extract %963[0] : vector<1xf32>
%965 = splat %964 : vector<4xf32>
%966 = vector.fma %965, %661, %962 : vector<4xf32>
%967 = vector.extract_strided_slice %645 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%968 = vector.extract %967[0] : vector<1xf32>
%969 = splat %968 : vector<4xf32>
%970 = vector.fma %969, %662, %966 : vector<4xf32>
%971 = vector.extract_strided_slice %646 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%972 = vector.extract %971[0] : vector<1xf32>
%973 = splat %972 : vector<4xf32>
%974 = vector.fma %973, %663, %970 : vector<4xf32>
%975 = vector.extract_strided_slice %646 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%976 = vector.extract %975[0] : vector<1xf32>
%977 = splat %976 : vector<4xf32>
%978 = vector.fma %977, %664, %974 : vector<4xf32>
%979 = vector.extract_strided_slice %646 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%980 = vector.extract %979[0] : vector<1xf32>
%981 = splat %980 : vector<4xf32>
%982 = vector.fma %981, %665, %978 : vector<4xf32>
%983 = vector.extract_strided_slice %646 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%984 = vector.extract %983[0] : vector<1xf32>
%985 = splat %984 : vector<4xf32>
%986 = vector.fma %985, %666, %982 : vector<4xf32>
%987 = vector.extract_strided_slice %647 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%988 = vector.extract %987[0] : vector<1xf32>
%989 = splat %988 : vector<4xf32>
%990 = vector.fma %989, %659, %arg13 : vector<4xf32>
%991 = vector.extract_strided_slice %647 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%992 = vector.extract %991[0] : vector<1xf32>
%993 = splat %992 : vector<4xf32>
%994 = vector.fma %993, %660, %990 : vector<4xf32>
%995 = vector.extract_strided_slice %647 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%996 = vector.extract %995[0] : vector<1xf32>
%997 = splat %996 : vector<4xf32>
%998 = vector.fma %997, %661, %994 : vector<4xf32>
%999 = vector.extract_strided_slice %647 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1000 = vector.extract %999[0] : vector<1xf32>
%1001 = splat %1000 : vector<4xf32>
%1002 = vector.fma %1001, %662, %998 : vector<4xf32>
%1003 = vector.extract_strided_slice %648 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1004 = vector.extract %1003[0] : vector<1xf32>
%1005 = splat %1004 : vector<4xf32>
%1006 = vector.fma %1005, %663, %1002 : vector<4xf32>
%1007 = vector.extract_strided_slice %648 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1008 = vector.extract %1007[0] : vector<1xf32>
%1009 = splat %1008 : vector<4xf32>
%1010 = vector.fma %1009, %664, %1006 : vector<4xf32>
%1011 = vector.extract_strided_slice %648 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1012 = vector.extract %1011[0] : vector<1xf32>
%1013 = splat %1012 : vector<4xf32>
%1014 = vector.fma %1013, %665, %1010 : vector<4xf32>
%1015 = vector.extract_strided_slice %648 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1016 = vector.extract %1015[0] : vector<1xf32>
%1017 = splat %1016 : vector<4xf32>
%1018 = vector.fma %1017, %666, %1014 : vector<4xf32>
%1019 = vector.extract_strided_slice %649 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1020 = vector.extract %1019[0] : vector<1xf32>
%1021 = splat %1020 : vector<4xf32>
%1022 = vector.fma %1021, %659, %arg14 : vector<4xf32>
%1023 = vector.extract_strided_slice %649 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1024 = vector.extract %1023[0] : vector<1xf32>
%1025 = splat %1024 : vector<4xf32>
%1026 = vector.fma %1025, %660, %1022 : vector<4xf32>
%1027 = vector.extract_strided_slice %649 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1028 = vector.extract %1027[0] : vector<1xf32>
%1029 = splat %1028 : vector<4xf32>
%1030 = vector.fma %1029, %661, %1026 : vector<4xf32>
%1031 = vector.extract_strided_slice %649 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1032 = vector.extract %1031[0] : vector<1xf32>
%1033 = splat %1032 : vector<4xf32>
%1034 = vector.fma %1033, %662, %1030 : vector<4xf32>
%1035 = vector.extract_strided_slice %650 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1036 = vector.extract %1035[0] : vector<1xf32>
%1037 = splat %1036 : vector<4xf32>
%1038 = vector.fma %1037, %663, %1034 : vector<4xf32>
%1039 = vector.extract_strided_slice %650 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1040 = vector.extract %1039[0] : vector<1xf32>
%1041 = splat %1040 : vector<4xf32>
%1042 = vector.fma %1041, %664, %1038 : vector<4xf32>
%1043 = vector.extract_strided_slice %650 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1044 = vector.extract %1043[0] : vector<1xf32>
%1045 = splat %1044 : vector<4xf32>
%1046 = vector.fma %1045, %665, %1042 : vector<4xf32>
%1047 = vector.extract_strided_slice %650 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1048 = vector.extract %1047[0] : vector<1xf32>
%1049 = splat %1048 : vector<4xf32>
%1050 = vector.fma %1049, %666, %1046 : vector<4xf32>
%1051 = vector.extract_strided_slice %651 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1052 = vector.extract %1051[0] : vector<1xf32>
%1053 = splat %1052 : vector<4xf32>
%1054 = vector.fma %1053, %659, %arg15 : vector<4xf32>
%1055 = vector.extract_strided_slice %651 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1056 = vector.extract %1055[0] : vector<1xf32>
%1057 = splat %1056 : vector<4xf32>
%1058 = vector.fma %1057, %660, %1054 : vector<4xf32>
%1059 = vector.extract_strided_slice %651 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1060 = vector.extract %1059[0] : vector<1xf32>
%1061 = splat %1060 : vector<4xf32>
%1062 = vector.fma %1061, %661, %1058 : vector<4xf32>
%1063 = vector.extract_strided_slice %651 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1064 = vector.extract %1063[0] : vector<1xf32>
%1065 = splat %1064 : vector<4xf32>
%1066 = vector.fma %1065, %662, %1062 : vector<4xf32>
%1067 = vector.extract_strided_slice %652 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1068 = vector.extract %1067[0] : vector<1xf32>
%1069 = splat %1068 : vector<4xf32>
%1070 = vector.fma %1069, %663, %1066 : vector<4xf32>
%1071 = vector.extract_strided_slice %652 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1072 = vector.extract %1071[0] : vector<1xf32>
%1073 = splat %1072 : vector<4xf32>
%1074 = vector.fma %1073, %664, %1070 : vector<4xf32>
%1075 = vector.extract_strided_slice %652 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1076 = vector.extract %1075[0] : vector<1xf32>
%1077 = splat %1076 : vector<4xf32>
%1078 = vector.fma %1077, %665, %1074 : vector<4xf32>
%1079 = vector.extract_strided_slice %652 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1080 = vector.extract %1079[0] : vector<1xf32>
%1081 = splat %1080 : vector<4xf32>
%1082 = vector.fma %1081, %666, %1078 : vector<4xf32>
%1083 = vector.extract_strided_slice %653 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1084 = vector.extract %1083[0] : vector<1xf32>
%1085 = splat %1084 : vector<4xf32>
%1086 = vector.fma %1085, %659, %arg16 : vector<4xf32>
%1087 = vector.extract_strided_slice %653 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1088 = vector.extract %1087[0] : vector<1xf32>
%1089 = splat %1088 : vector<4xf32>
%1090 = vector.fma %1089, %660, %1086 : vector<4xf32>
%1091 = vector.extract_strided_slice %653 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1092 = vector.extract %1091[0] : vector<1xf32>
%1093 = splat %1092 : vector<4xf32>
%1094 = vector.fma %1093, %661, %1090 : vector<4xf32>
%1095 = vector.extract_strided_slice %653 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1096 = vector.extract %1095[0] : vector<1xf32>
%1097 = splat %1096 : vector<4xf32>
%1098 = vector.fma %1097, %662, %1094 : vector<4xf32>
%1099 = vector.extract_strided_slice %654 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1100 = vector.extract %1099[0] : vector<1xf32>
%1101 = splat %1100 : vector<4xf32>
%1102 = vector.fma %1101, %663, %1098 : vector<4xf32>
%1103 = vector.extract_strided_slice %654 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1104 = vector.extract %1103[0] : vector<1xf32>
%1105 = splat %1104 : vector<4xf32>
%1106 = vector.fma %1105, %664, %1102 : vector<4xf32>
%1107 = vector.extract_strided_slice %654 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1108 = vector.extract %1107[0] : vector<1xf32>
%1109 = splat %1108 : vector<4xf32>
%1110 = vector.fma %1109, %665, %1106 : vector<4xf32>
%1111 = vector.extract_strided_slice %654 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1112 = vector.extract %1111[0] : vector<1xf32>
%1113 = splat %1112 : vector<4xf32>
%1114 = vector.fma %1113, %666, %1110 : vector<4xf32>
%1115 = vector.extract_strided_slice %655 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1116 = vector.extract %1115[0] : vector<1xf32>
%1117 = splat %1116 : vector<4xf32>
%1118 = vector.fma %1117, %659, %arg17 : vector<4xf32>
%1119 = vector.extract_strided_slice %655 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1120 = vector.extract %1119[0] : vector<1xf32>
%1121 = splat %1120 : vector<4xf32>
%1122 = vector.fma %1121, %660, %1118 : vector<4xf32>
%1123 = vector.extract_strided_slice %655 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1124 = vector.extract %1123[0] : vector<1xf32>
%1125 = splat %1124 : vector<4xf32>
%1126 = vector.fma %1125, %661, %1122 : vector<4xf32>
%1127 = vector.extract_strided_slice %655 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1128 = vector.extract %1127[0] : vector<1xf32>
%1129 = splat %1128 : vector<4xf32>
%1130 = vector.fma %1129, %662, %1126 : vector<4xf32>
%1131 = vector.extract_strided_slice %656 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1132 = vector.extract %1131[0] : vector<1xf32>
%1133 = splat %1132 : vector<4xf32>
%1134 = vector.fma %1133, %663, %1130 : vector<4xf32>
%1135 = vector.extract_strided_slice %656 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1136 = vector.extract %1135[0] : vector<1xf32>
%1137 = splat %1136 : vector<4xf32>
%1138 = vector.fma %1137, %664, %1134 : vector<4xf32>
%1139 = vector.extract_strided_slice %656 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1140 = vector.extract %1139[0] : vector<1xf32>
%1141 = splat %1140 : vector<4xf32>
%1142 = vector.fma %1141, %665, %1138 : vector<4xf32>
%1143 = vector.extract_strided_slice %656 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1144 = vector.extract %1143[0] : vector<1xf32>
%1145 = splat %1144 : vector<4xf32>
%1146 = vector.fma %1145, %666, %1142 : vector<4xf32>
%1147 = vector.extract_strided_slice %657 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1148 = vector.extract %1147[0] : vector<1xf32>
%1149 = splat %1148 : vector<4xf32>
%1150 = vector.fma %1149, %659, %arg18 : vector<4xf32>
%1151 = vector.extract_strided_slice %657 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1152 = vector.extract %1151[0] : vector<1xf32>
%1153 = splat %1152 : vector<4xf32>
%1154 = vector.fma %1153, %660, %1150 : vector<4xf32>
%1155 = vector.extract_strided_slice %657 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1156 = vector.extract %1155[0] : vector<1xf32>
%1157 = splat %1156 : vector<4xf32>
%1158 = vector.fma %1157, %661, %1154 : vector<4xf32>
%1159 = vector.extract_strided_slice %657 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1160 = vector.extract %1159[0] : vector<1xf32>
%1161 = splat %1160 : vector<4xf32>
%1162 = vector.fma %1161, %662, %1158 : vector<4xf32>
%1163 = vector.extract_strided_slice %658 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1164 = vector.extract %1163[0] : vector<1xf32>
%1165 = splat %1164 : vector<4xf32>
%1166 = vector.fma %1165, %663, %1162 : vector<4xf32>
%1167 = vector.extract_strided_slice %658 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1168 = vector.extract %1167[0] : vector<1xf32>
%1169 = splat %1168 : vector<4xf32>
%1170 = vector.fma %1169, %664, %1166 : vector<4xf32>
%1171 = vector.extract_strided_slice %658 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1172 = vector.extract %1171[0] : vector<1xf32>
%1173 = splat %1172 : vector<4xf32>
%1174 = vector.fma %1173, %665, %1170 : vector<4xf32>
%1175 = vector.extract_strided_slice %658 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1176 = vector.extract %1175[0] : vector<1xf32>
%1177 = splat %1176 : vector<4xf32>
%1178 = vector.fma %1177, %666, %1174 : vector<4xf32>
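// Prefetch for the next pipeline stage: advance K by 8 and issue the next
// global loads of A and B. They are yielded alongside the 16 updated
// accumulators and only land in shared memory at the top of the next
// iteration, overlapping global-memory latency with the fmas above.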
%1179 = addi %arg2, %c8 : index
%1180 = memref.subview %65[0, %1179] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%1181 = addi %arg2, %c8 : index
%1182 = memref.subview %66[%1181, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%1183 = vector.transfer_read %1180[%25, %34], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%1184 = vector.transfer_read %1182[%43, %52], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%1185 = vector.transfer_read %1182[%62, %52], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
scf.yield %698, %730, %762, %794, %826, %858, %890, %922, %954, %986, %1018, %1050, %1082, %1114, %1146, %1178, %1183, %1184, %1185 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
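// Pipeline epilogue: the loop exits with the last K-step's loads still in
// flight (%74#16..%74#18). Store them to shared memory and run one more
// round of fragment reads and fmas to cover k = 1016..1023.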
gpu.barrier
vector.transfer_write %74#16, %14[%25, %34] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %74#17, %15[%43, %52] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %74#18, %15[%62, %52] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
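// Same fragment reads and outer-product chain as in the loop body, now
// accumulating into the loop results %74#0..%74#15.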
%75 = vector.transfer_read %63[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%76 = vector.transfer_read %63[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%77 = vector.transfer_read %63[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%78 = vector.transfer_read %63[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%79 = vector.transfer_read %63[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%80 = vector.transfer_read %63[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%81 = vector.transfer_read %63[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%82 = vector.transfer_read %63[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%83 = vector.transfer_read %63[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%84 = vector.transfer_read %63[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%85 = vector.transfer_read %63[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%86 = vector.transfer_read %63[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%87 = vector.transfer_read %63[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%88 = vector.transfer_read %63[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%89 = vector.transfer_read %63[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%90 = vector.transfer_read %63[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%91 = vector.transfer_read %63[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%92 = vector.transfer_read %63[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%93 = vector.transfer_read %63[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%94 = vector.transfer_read %63[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%95 = vector.transfer_read %63[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%96 = vector.transfer_read %63[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%97 = vector.transfer_read %63[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%98 = vector.transfer_read %63[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%99 = vector.transfer_read %63[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%100 = vector.transfer_read %63[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%101 = vector.transfer_read %63[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%102 = vector.transfer_read %63[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%103 = vector.transfer_read %63[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%104 = vector.transfer_read %63[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%105 = vector.transfer_read %63[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%106 = vector.transfer_read %63[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%107 = vector.transfer_read %64[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%108 = vector.transfer_read %64[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%109 = vector.transfer_read %64[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%110 = vector.transfer_read %64[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%111 = vector.transfer_read %64[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%112 = vector.transfer_read %64[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%113 = vector.transfer_read %64[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%114 = vector.transfer_read %64[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%115 = vector.extract_strided_slice %75 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%116 = vector.extract %115[0] : vector<1xf32>
%117 = splat %116 : vector<4xf32>
%118 = vector.fma %117, %107, %74#0 : vector<4xf32>
%119 = vector.extract_strided_slice %75 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%120 = vector.extract %119[0] : vector<1xf32>
%121 = splat %120 : vector<4xf32>
%122 = vector.fma %121, %108, %118 : vector<4xf32>
%123 = vector.extract_strided_slice %75 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%124 = vector.extract %123[0] : vector<1xf32>
%125 = splat %124 : vector<4xf32>
%126 = vector.fma %125, %109, %122 : vector<4xf32>
%127 = vector.extract_strided_slice %75 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%128 = vector.extract %127[0] : vector<1xf32>
%129 = splat %128 : vector<4xf32>
%130 = vector.fma %129, %110, %126 : vector<4xf32>
%131 = vector.extract_strided_slice %76 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%132 = vector.extract %131[0] : vector<1xf32>
%133 = splat %132 : vector<4xf32>
%134 = vector.fma %133, %111, %130 : vector<4xf32>
%135 = vector.extract_strided_slice %76 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%136 = vector.extract %135[0] : vector<1xf32>
%137 = splat %136 : vector<4xf32>
%138 = vector.fma %137, %112, %134 : vector<4xf32>
%139 = vector.extract_strided_slice %76 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%140 = vector.extract %139[0] : vector<1xf32>
%141 = splat %140 : vector<4xf32>
%142 = vector.fma %141, %113, %138 : vector<4xf32>
%143 = vector.extract_strided_slice %76 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%144 = vector.extract %143[0] : vector<1xf32>
%145 = splat %144 : vector<4xf32>
%146 = vector.fma %145, %114, %142 : vector<4xf32>
%147 = vector.extract_strided_slice %77 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%148 = vector.extract %147[0] : vector<1xf32>
%149 = splat %148 : vector<4xf32>
%150 = vector.fma %149, %107, %74#1 : vector<4xf32>
%151 = vector.extract_strided_slice %77 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%152 = vector.extract %151[0] : vector<1xf32>
%153 = splat %152 : vector<4xf32>
%154 = vector.fma %153, %108, %150 : vector<4xf32>
%155 = vector.extract_strided_slice %77 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%156 = vector.extract %155[0] : vector<1xf32>
%157 = splat %156 : vector<4xf32>
%158 = vector.fma %157, %109, %154 : vector<4xf32>
%159 = vector.extract_strided_slice %77 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%160 = vector.extract %159[0] : vector<1xf32>
%161 = splat %160 : vector<4xf32>
%162 = vector.fma %161, %110, %158 : vector<4xf32>
%163 = vector.extract_strided_slice %78 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%164 = vector.extract %163[0] : vector<1xf32>
%165 = splat %164 : vector<4xf32>
%166 = vector.fma %165, %111, %162 : vector<4xf32>
%167 = vector.extract_strided_slice %78 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%168 = vector.extract %167[0] : vector<1xf32>
%169 = splat %168 : vector<4xf32>
%170 = vector.fma %169, %112, %166 : vector<4xf32>
%171 = vector.extract_strided_slice %78 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%172 = vector.extract %171[0] : vector<1xf32>
%173 = splat %172 : vector<4xf32>
%174 = vector.fma %173, %113, %170 : vector<4xf32>
%175 = vector.extract_strided_slice %78 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%176 = vector.extract %175[0] : vector<1xf32>
%177 = splat %176 : vector<4xf32>
%178 = vector.fma %177, %114, %174 : vector<4xf32>
%179 = vector.extract_strided_slice %79 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%180 = vector.extract %179[0] : vector<1xf32>
%181 = splat %180 : vector<4xf32>
%182 = vector.fma %181, %107, %74#2 : vector<4xf32>
%183 = vector.extract_strided_slice %79 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%184 = vector.extract %183[0] : vector<1xf32>
%185 = splat %184 : vector<4xf32>
%186 = vector.fma %185, %108, %182 : vector<4xf32>
%187 = vector.extract_strided_slice %79 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%188 = vector.extract %187[0] : vector<1xf32>
%189 = splat %188 : vector<4xf32>
%190 = vector.fma %189, %109, %186 : vector<4xf32>
%191 = vector.extract_strided_slice %79 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%192 = vector.extract %191[0] : vector<1xf32>
%193 = splat %192 : vector<4xf32>
%194 = vector.fma %193, %110, %190 : vector<4xf32>
%195 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%196 = vector.extract %195[0] : vector<1xf32>
%197 = splat %196 : vector<4xf32>
%198 = vector.fma %197, %111, %194 : vector<4xf32>
%199 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%200 = vector.extract %199[0] : vector<1xf32>
%201 = splat %200 : vector<4xf32>
%202 = vector.fma %201, %112, %198 : vector<4xf32>
%203 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%204 = vector.extract %203[0] : vector<1xf32>
%205 = splat %204 : vector<4xf32>
%206 = vector.fma %205, %113, %202 : vector<4xf32>
%207 = vector.extract_strided_slice %80 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%208 = vector.extract %207[0] : vector<1xf32>
%209 = splat %208 : vector<4xf32>
%210 = vector.fma %209, %114, %206 : vector<4xf32>
%211 = vector.extract_strided_slice %81 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%212 = vector.extract %211[0] : vector<1xf32>
%213 = splat %212 : vector<4xf32>
%214 = vector.fma %213, %107, %74#3 : vector<4xf32>
%215 = vector.extract_strided_slice %81 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%216 = vector.extract %215[0] : vector<1xf32>
%217 = splat %216 : vector<4xf32>
%218 = vector.fma %217, %108, %214 : vector<4xf32>
%219 = vector.extract_strided_slice %81 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%220 = vector.extract %219[0] : vector<1xf32>
%221 = splat %220 : vector<4xf32>
%222 = vector.fma %221, %109, %218 : vector<4xf32>
%223 = vector.extract_strided_slice %81 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%224 = vector.extract %223[0] : vector<1xf32>
%225 = splat %224 : vector<4xf32>
%226 = vector.fma %225, %110, %222 : vector<4xf32>
%227 = vector.extract_strided_slice %82 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%228 = vector.extract %227[0] : vector<1xf32>
%229 = splat %228 : vector<4xf32>
%230 = vector.fma %229, %111, %226 : vector<4xf32>
%231 = vector.extract_strided_slice %82 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%232 = vector.extract %231[0] : vector<1xf32>
%233 = splat %232 : vector<4xf32>
%234 = vector.fma %233, %112, %230 : vector<4xf32>
%235 = vector.extract_strided_slice %82 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%236 = vector.extract %235[0] : vector<1xf32>
%237 = splat %236 : vector<4xf32>
%238 = vector.fma %237, %113, %234 : vector<4xf32>
%239 = vector.extract_strided_slice %82 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%240 = vector.extract %239[0] : vector<1xf32>
%241 = splat %240 : vector<4xf32>
%242 = vector.fma %241, %114, %238 : vector<4xf32>
%243 = vector.extract_strided_slice %83 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%244 = vector.extract %243[0] : vector<1xf32>
%245 = splat %244 : vector<4xf32>
%246 = vector.fma %245, %107, %74#4 : vector<4xf32>
%247 = vector.extract_strided_slice %83 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%248 = vector.extract %247[0] : vector<1xf32>
%249 = splat %248 : vector<4xf32>
%250 = vector.fma %249, %108, %246 : vector<4xf32>
%251 = vector.extract_strided_slice %83 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%252 = vector.extract %251[0] : vector<1xf32>
%253 = splat %252 : vector<4xf32>
%254 = vector.fma %253, %109, %250 : vector<4xf32>
%255 = vector.extract_strided_slice %83 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%256 = vector.extract %255[0] : vector<1xf32>
%257 = splat %256 : vector<4xf32>
%258 = vector.fma %257, %110, %254 : vector<4xf32>
%259 = vector.extract_strided_slice %84 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%260 = vector.extract %259[0] : vector<1xf32>
%261 = splat %260 : vector<4xf32>
%262 = vector.fma %261, %111, %258 : vector<4xf32>
%263 = vector.extract_strided_slice %84 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%264 = vector.extract %263[0] : vector<1xf32>
%265 = splat %264 : vector<4xf32>
%266 = vector.fma %265, %112, %262 : vector<4xf32>
%267 = vector.extract_strided_slice %84 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%268 = vector.extract %267[0] : vector<1xf32>
%269 = splat %268 : vector<4xf32>
%270 = vector.fma %269, %113, %266 : vector<4xf32>
%271 = vector.extract_strided_slice %84 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%272 = vector.extract %271[0] : vector<1xf32>
%273 = splat %272 : vector<4xf32>
%274 = vector.fma %273, %114, %270 : vector<4xf32>
%275 = vector.extract_strided_slice %85 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%276 = vector.extract %275[0] : vector<1xf32>
%277 = splat %276 : vector<4xf32>
%278 = vector.fma %277, %107, %74#5 : vector<4xf32>
%279 = vector.extract_strided_slice %85 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%280 = vector.extract %279[0] : vector<1xf32>
%281 = splat %280 : vector<4xf32>
%282 = vector.fma %281, %108, %278 : vector<4xf32>
%283 = vector.extract_strided_slice %85 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%284 = vector.extract %283[0] : vector<1xf32>
%285 = splat %284 : vector<4xf32>
%286 = vector.fma %285, %109, %282 : vector<4xf32>
%287 = vector.extract_strided_slice %85 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%288 = vector.extract %287[0] : vector<1xf32>
%289 = splat %288 : vector<4xf32>
%290 = vector.fma %289, %110, %286 : vector<4xf32>
%291 = vector.extract_strided_slice %86 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%292 = vector.extract %291[0] : vector<1xf32>
%293 = splat %292 : vector<4xf32>
%294 = vector.fma %293, %111, %290 : vector<4xf32>
%295 = vector.extract_strided_slice %86 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%296 = vector.extract %295[0] : vector<1xf32>
%297 = splat %296 : vector<4xf32>
%298 = vector.fma %297, %112, %294 : vector<4xf32>
%299 = vector.extract_strided_slice %86 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%300 = vector.extract %299[0] : vector<1xf32>
%301 = splat %300 : vector<4xf32>
%302 = vector.fma %301, %113, %298 : vector<4xf32>
%303 = vector.extract_strided_slice %86 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%304 = vector.extract %303[0] : vector<1xf32>
%305 = splat %304 : vector<4xf32>
%306 = vector.fma %305, %114, %302 : vector<4xf32>
%307 = vector.extract_strided_slice %87 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%308 = vector.extract %307[0] : vector<1xf32>
%309 = splat %308 : vector<4xf32>
%310 = vector.fma %309, %107, %74#6 : vector<4xf32>
%311 = vector.extract_strided_slice %87 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%312 = vector.extract %311[0] : vector<1xf32>
%313 = splat %312 : vector<4xf32>
%314 = vector.fma %313, %108, %310 : vector<4xf32>
%315 = vector.extract_strided_slice %87 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%316 = vector.extract %315[0] : vector<1xf32>
%317 = splat %316 : vector<4xf32>
%318 = vector.fma %317, %109, %314 : vector<4xf32>
%319 = vector.extract_strided_slice %87 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%320 = vector.extract %319[0] : vector<1xf32>
%321 = splat %320 : vector<4xf32>
%322 = vector.fma %321, %110, %318 : vector<4xf32>
%323 = vector.extract_strided_slice %88 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%324 = vector.extract %323[0] : vector<1xf32>
%325 = splat %324 : vector<4xf32>
%326 = vector.fma %325, %111, %322 : vector<4xf32>
%327 = vector.extract_strided_slice %88 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%328 = vector.extract %327[0] : vector<1xf32>
%329 = splat %328 : vector<4xf32>
%330 = vector.fma %329, %112, %326 : vector<4xf32>
%331 = vector.extract_strided_slice %88 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%332 = vector.extract %331[0] : vector<1xf32>
%333 = splat %332 : vector<4xf32>
%334 = vector.fma %333, %113, %330 : vector<4xf32>
%335 = vector.extract_strided_slice %88 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%336 = vector.extract %335[0] : vector<1xf32>
%337 = splat %336 : vector<4xf32>
%338 = vector.fma %337, %114, %334 : vector<4xf32>
%339 = vector.extract_strided_slice %89 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%340 = vector.extract %339[0] : vector<1xf32>
%341 = splat %340 : vector<4xf32>
%342 = vector.fma %341, %107, %74#7 : vector<4xf32>
%343 = vector.extract_strided_slice %89 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%344 = vector.extract %343[0] : vector<1xf32>
%345 = splat %344 : vector<4xf32>
%346 = vector.fma %345, %108, %342 : vector<4xf32>
%347 = vector.extract_strided_slice %89 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%348 = vector.extract %347[0] : vector<1xf32>
%349 = splat %348 : vector<4xf32>
%350 = vector.fma %349, %109, %346 : vector<4xf32>
%351 = vector.extract_strided_slice %89 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%352 = vector.extract %351[0] : vector<1xf32>
%353 = splat %352 : vector<4xf32>
%354 = vector.fma %353, %110, %350 : vector<4xf32>
%355 = vector.extract_strided_slice %90 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%356 = vector.extract %355[0] : vector<1xf32>
%357 = splat %356 : vector<4xf32>
%358 = vector.fma %357, %111, %354 : vector<4xf32>
%359 = vector.extract_strided_slice %90 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%360 = vector.extract %359[0] : vector<1xf32>
%361 = splat %360 : vector<4xf32>
%362 = vector.fma %361, %112, %358 : vector<4xf32>
%363 = vector.extract_strided_slice %90 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%364 = vector.extract %363[0] : vector<1xf32>
%365 = splat %364 : vector<4xf32>
%366 = vector.fma %365, %113, %362 : vector<4xf32>
%367 = vector.extract_strided_slice %90 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%368 = vector.extract %367[0] : vector<1xf32>
%369 = splat %368 : vector<4xf32>
%370 = vector.fma %369, %114, %366 : vector<4xf32>
%371 = vector.extract_strided_slice %91 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%372 = vector.extract %371[0] : vector<1xf32>
%373 = splat %372 : vector<4xf32>
%374 = vector.fma %373, %107, %74#8 : vector<4xf32>
%375 = vector.extract_strided_slice %91 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%376 = vector.extract %375[0] : vector<1xf32>
%377 = splat %376 : vector<4xf32>
%378 = vector.fma %377, %108, %374 : vector<4xf32>
%379 = vector.extract_strided_slice %91 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%380 = vector.extract %379[0] : vector<1xf32>
%381 = splat %380 : vector<4xf32>
%382 = vector.fma %381, %109, %378 : vector<4xf32>
%383 = vector.extract_strided_slice %91 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%384 = vector.extract %383[0] : vector<1xf32>
%385 = splat %384 : vector<4xf32>
%386 = vector.fma %385, %110, %382 : vector<4xf32>
%387 = vector.extract_strided_slice %92 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%388 = vector.extract %387[0] : vector<1xf32>
%389 = splat %388 : vector<4xf32>
%390 = vector.fma %389, %111, %386 : vector<4xf32>
%391 = vector.extract_strided_slice %92 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%392 = vector.extract %391[0] : vector<1xf32>
%393 = splat %392 : vector<4xf32>
%394 = vector.fma %393, %112, %390 : vector<4xf32>
%395 = vector.extract_strided_slice %92 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%396 = vector.extract %395[0] : vector<1xf32>
%397 = splat %396 : vector<4xf32>
%398 = vector.fma %397, %113, %394 : vector<4xf32>
%399 = vector.extract_strided_slice %92 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%400 = vector.extract %399[0] : vector<1xf32>
%401 = splat %400 : vector<4xf32>
%402 = vector.fma %401, %114, %398 : vector<4xf32>
%403 = vector.extract_strided_slice %93 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%404 = vector.extract %403[0] : vector<1xf32>
%405 = splat %404 : vector<4xf32>
%406 = vector.fma %405, %107, %74#9 : vector<4xf32>
%407 = vector.extract_strided_slice %93 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%408 = vector.extract %407[0] : vector<1xf32>
%409 = splat %408 : vector<4xf32>
%410 = vector.fma %409, %108, %406 : vector<4xf32>
%411 = vector.extract_strided_slice %93 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%412 = vector.extract %411[0] : vector<1xf32>
%413 = splat %412 : vector<4xf32>
%414 = vector.fma %413, %109, %410 : vector<4xf32>
%415 = vector.extract_strided_slice %93 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%416 = vector.extract %415[0] : vector<1xf32>
%417 = splat %416 : vector<4xf32>
%418 = vector.fma %417, %110, %414 : vector<4xf32>
%419 = vector.extract_strided_slice %94 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%420 = vector.extract %419[0] : vector<1xf32>
%421 = splat %420 : vector<4xf32>
%422 = vector.fma %421, %111, %418 : vector<4xf32>
%423 = vector.extract_strided_slice %94 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%424 = vector.extract %423[0] : vector<1xf32>
%425 = splat %424 : vector<4xf32>
%426 = vector.fma %425, %112, %422 : vector<4xf32>
%427 = vector.extract_strided_slice %94 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%428 = vector.extract %427[0] : vector<1xf32>
%429 = splat %428 : vector<4xf32>
%430 = vector.fma %429, %113, %426 : vector<4xf32>
%431 = vector.extract_strided_slice %94 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%432 = vector.extract %431[0] : vector<1xf32>
%433 = splat %432 : vector<4xf32>
%434 = vector.fma %433, %114, %430 : vector<4xf32>
%435 = vector.extract_strided_slice %95 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%436 = vector.extract %435[0] : vector<1xf32>
%437 = splat %436 : vector<4xf32>
%438 = vector.fma %437, %107, %74#10 : vector<4xf32>
%439 = vector.extract_strided_slice %95 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%440 = vector.extract %439[0] : vector<1xf32>
%441 = splat %440 : vector<4xf32>
%442 = vector.fma %441, %108, %438 : vector<4xf32>
%443 = vector.extract_strided_slice %95 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%444 = vector.extract %443[0] : vector<1xf32>
%445 = splat %444 : vector<4xf32>
%446 = vector.fma %445, %109, %442 : vector<4xf32>
%447 = vector.extract_strided_slice %95 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%448 = vector.extract %447[0] : vector<1xf32>
%449 = splat %448 : vector<4xf32>
%450 = vector.fma %449, %110, %446 : vector<4xf32>
%451 = vector.extract_strided_slice %96 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%452 = vector.extract %451[0] : vector<1xf32>
%453 = splat %452 : vector<4xf32>
%454 = vector.fma %453, %111, %450 : vector<4xf32>
%455 = vector.extract_strided_slice %96 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%456 = vector.extract %455[0] : vector<1xf32>
%457 = splat %456 : vector<4xf32>
%458 = vector.fma %457, %112, %454 : vector<4xf32>
%459 = vector.extract_strided_slice %96 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%460 = vector.extract %459[0] : vector<1xf32>
%461 = splat %460 : vector<4xf32>
%462 = vector.fma %461, %113, %458 : vector<4xf32>
%463 = vector.extract_strided_slice %96 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%464 = vector.extract %463[0] : vector<1xf32>
%465 = splat %464 : vector<4xf32>
%466 = vector.fma %465, %114, %462 : vector<4xf32>
%467 = vector.extract_strided_slice %97 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%468 = vector.extract %467[0] : vector<1xf32>
%469 = splat %468 : vector<4xf32>
%470 = vector.fma %469, %107, %74#11 : vector<4xf32>
%471 = vector.extract_strided_slice %97 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%472 = vector.extract %471[0] : vector<1xf32>
%473 = splat %472 : vector<4xf32>
%474 = vector.fma %473, %108, %470 : vector<4xf32>
%475 = vector.extract_strided_slice %97 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%476 = vector.extract %475[0] : vector<1xf32>
%477 = splat %476 : vector<4xf32>
%478 = vector.fma %477, %109, %474 : vector<4xf32>
%479 = vector.extract_strided_slice %97 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%480 = vector.extract %479[0] : vector<1xf32>
%481 = splat %480 : vector<4xf32>
%482 = vector.fma %481, %110, %478 : vector<4xf32>
%483 = vector.extract_strided_slice %98 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%484 = vector.extract %483[0] : vector<1xf32>
%485 = splat %484 : vector<4xf32>
%486 = vector.fma %485, %111, %482 : vector<4xf32>
%487 = vector.extract_strided_slice %98 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%488 = vector.extract %487[0] : vector<1xf32>
%489 = splat %488 : vector<4xf32>
%490 = vector.fma %489, %112, %486 : vector<4xf32>
%491 = vector.extract_strided_slice %98 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%492 = vector.extract %491[0] : vector<1xf32>
%493 = splat %492 : vector<4xf32>
%494 = vector.fma %493, %113, %490 : vector<4xf32>
%495 = vector.extract_strided_slice %98 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%496 = vector.extract %495[0] : vector<1xf32>
%497 = splat %496 : vector<4xf32>
%498 = vector.fma %497, %114, %494 : vector<4xf32>
%499 = vector.extract_strided_slice %99 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%500 = vector.extract %499[0] : vector<1xf32>
%501 = splat %500 : vector<4xf32>
%502 = vector.fma %501, %107, %74#12 : vector<4xf32>
%503 = vector.extract_strided_slice %99 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%504 = vector.extract %503[0] : vector<1xf32>
%505 = splat %504 : vector<4xf32>
%506 = vector.fma %505, %108, %502 : vector<4xf32>
%507 = vector.extract_strided_slice %99 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%508 = vector.extract %507[0] : vector<1xf32>
%509 = splat %508 : vector<4xf32>
%510 = vector.fma %509, %109, %506 : vector<4xf32>
%511 = vector.extract_strided_slice %99 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%512 = vector.extract %511[0] : vector<1xf32>
%513 = splat %512 : vector<4xf32>
%514 = vector.fma %513, %110, %510 : vector<4xf32>
%515 = vector.extract_strided_slice %100 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%516 = vector.extract %515[0] : vector<1xf32>
%517 = splat %516 : vector<4xf32>
%518 = vector.fma %517, %111, %514 : vector<4xf32>
%519 = vector.extract_strided_slice %100 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%520 = vector.extract %519[0] : vector<1xf32>
%521 = splat %520 : vector<4xf32>
%522 = vector.fma %521, %112, %518 : vector<4xf32>
%523 = vector.extract_strided_slice %100 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%524 = vector.extract %523[0] : vector<1xf32>
%525 = splat %524 : vector<4xf32>
%526 = vector.fma %525, %113, %522 : vector<4xf32>
%527 = vector.extract_strided_slice %100 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%528 = vector.extract %527[0] : vector<1xf32>
%529 = splat %528 : vector<4xf32>
%530 = vector.fma %529, %114, %526 : vector<4xf32>
%531 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%532 = vector.extract %531[0] : vector<1xf32>
%533 = splat %532 : vector<4xf32>
%534 = vector.fma %533, %107, %74#13 : vector<4xf32>
%535 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%536 = vector.extract %535[0] : vector<1xf32>
%537 = splat %536 : vector<4xf32>
%538 = vector.fma %537, %108, %534 : vector<4xf32>
%539 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%540 = vector.extract %539[0] : vector<1xf32>
%541 = splat %540 : vector<4xf32>
%542 = vector.fma %541, %109, %538 : vector<4xf32>
%543 = vector.extract_strided_slice %101 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%544 = vector.extract %543[0] : vector<1xf32>
%545 = splat %544 : vector<4xf32>
%546 = vector.fma %545, %110, %542 : vector<4xf32>
%547 = vector.extract_strided_slice %102 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%548 = vector.extract %547[0] : vector<1xf32>
%549 = splat %548 : vector<4xf32>
%550 = vector.fma %549, %111, %546 : vector<4xf32>
%551 = vector.extract_strided_slice %102 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%552 = vector.extract %551[0] : vector<1xf32>
%553 = splat %552 : vector<4xf32>
%554 = vector.fma %553, %112, %550 : vector<4xf32>
%555 = vector.extract_strided_slice %102 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%556 = vector.extract %555[0] : vector<1xf32>
%557 = splat %556 : vector<4xf32>
%558 = vector.fma %557, %113, %554 : vector<4xf32>
%559 = vector.extract_strided_slice %102 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%560 = vector.extract %559[0] : vector<1xf32>
%561 = splat %560 : vector<4xf32>
%562 = vector.fma %561, %114, %558 : vector<4xf32>
%563 = vector.extract_strided_slice %103 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%564 = vector.extract %563[0] : vector<1xf32>
%565 = splat %564 : vector<4xf32>
%566 = vector.fma %565, %107, %74#14 : vector<4xf32>
%567 = vector.extract_strided_slice %103 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%568 = vector.extract %567[0] : vector<1xf32>
%569 = splat %568 : vector<4xf32>
%570 = vector.fma %569, %108, %566 : vector<4xf32>
%571 = vector.extract_strided_slice %103 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%572 = vector.extract %571[0] : vector<1xf32>
%573 = splat %572 : vector<4xf32>
%574 = vector.fma %573, %109, %570 : vector<4xf32>
%575 = vector.extract_strided_slice %103 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%576 = vector.extract %575[0] : vector<1xf32>
%577 = splat %576 : vector<4xf32>
%578 = vector.fma %577, %110, %574 : vector<4xf32>
%579 = vector.extract_strided_slice %104 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%580 = vector.extract %579[0] : vector<1xf32>
%581 = splat %580 : vector<4xf32>
%582 = vector.fma %581, %111, %578 : vector<4xf32>
%583 = vector.extract_strided_slice %104 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%584 = vector.extract %583[0] : vector<1xf32>
%585 = splat %584 : vector<4xf32>
%586 = vector.fma %585, %112, %582 : vector<4xf32>
%587 = vector.extract_strided_slice %104 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%588 = vector.extract %587[0] : vector<1xf32>
%589 = splat %588 : vector<4xf32>
%590 = vector.fma %589, %113, %586 : vector<4xf32>
%591 = vector.extract_strided_slice %104 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%592 = vector.extract %591[0] : vector<1xf32>
%593 = splat %592 : vector<4xf32>
%594 = vector.fma %593, %114, %590 : vector<4xf32>
%595 = vector.extract_strided_slice %105 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%596 = vector.extract %595[0] : vector<1xf32>
%597 = splat %596 : vector<4xf32>
%598 = vector.fma %597, %107, %74#15 : vector<4xf32>
%599 = vector.extract_strided_slice %105 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%600 = vector.extract %599[0] : vector<1xf32>
%601 = splat %600 : vector<4xf32>
%602 = vector.fma %601, %108, %598 : vector<4xf32>
%603 = vector.extract_strided_slice %105 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%604 = vector.extract %603[0] : vector<1xf32>
%605 = splat %604 : vector<4xf32>
%606 = vector.fma %605, %109, %602 : vector<4xf32>
%607 = vector.extract_strided_slice %105 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%608 = vector.extract %607[0] : vector<1xf32>
%609 = splat %608 : vector<4xf32>
%610 = vector.fma %609, %110, %606 : vector<4xf32>
%611 = vector.extract_strided_slice %106 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%612 = vector.extract %611[0] : vector<1xf32>
%613 = splat %612 : vector<4xf32>
%614 = vector.fma %613, %111, %610 : vector<4xf32>
%615 = vector.extract_strided_slice %106 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%616 = vector.extract %615[0] : vector<1xf32>
%617 = splat %616 : vector<4xf32>
%618 = vector.fma %617, %112, %614 : vector<4xf32>
%619 = vector.extract_strided_slice %106 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%620 = vector.extract %619[0] : vector<1xf32>
%621 = splat %620 : vector<4xf32>
%622 = vector.fma %621, %113, %618 : vector<4xf32>
%623 = vector.extract_strided_slice %106 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%624 = vector.extract %623[0] : vector<1xf32>
%625 = splat %624 : vector<4xf32>
%626 = vector.fma %625, %114, %622 : vector<4xf32>
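    // note: epilogue writeback -- the sixteen final accumulator vectors (%146, %178, ...,
    // %626) are stored row by row into the thread's 16x4 output subview %68, emitted here
    // in reverse row order.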
vector.transfer_write %626, %68[%c15, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %594, %68[%c14, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %562, %68[%c13, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %530, %68[%c12, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %498, %68[%c11, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %466, %68[%c10, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %434, %68[%c9, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %402, %68[%c8, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %370, %68[%c7, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %338, %68[%c6, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %306, %68[%c5, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %274, %68[%c4, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %242, %68[%c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %210, %68[%c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %178, %68[%c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
vector.transfer_write %146, %68[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
module {
memref.global "private" @__shared_memory___0 : memref<8x128xf32, 3>
memref.global "private" @__shared_memory__ : memref<64x8xf32, 3>
func @_large_aligned_dispatch_0() {
%c4 = constant 4 : index
%c-1 = constant -1 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c-128 = constant -128 : index
%c-8 = constant -8 : index
%c2 = constant 2 : index
%c64 = constant 64 : index
%c16 = constant 16 : index
%cst = constant dense<0.000000e+00> : vector<4xf32>
%c128 = constant 128 : index
%c1016 = constant 1016 : index
%c8 = constant 8 : index
%cst_0 = constant 0.000000e+00 : f32
%c2048 = constant 2048 : index
%c512 = constant 512 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c5 = constant 5 : index
%c6 = constant 6 : index
%c7 = constant 7 : index
%c9 = constant 9 : index
%c10 = constant 10 : index
%c11 = constant 11 : index
%c12 = constant 12 : index
%c13 = constant 13 : index
%c14 = constant 14 : index
%c15 = constant 15 : index
%0 = "gpu.thread_id"() {dimension = "x"} : () -> index
%1 = "gpu.thread_id"() {dimension = "y"} : () -> index
%2 = "gpu.thread_id"() {dimension = "z"} : () -> index
%3 = memref.get_global @__shared_memory___0 : memref<8x128xf32, 3>
%4 = memref.get_global @__shared_memory__ : memref<64x8xf32, 3>
%5 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<2048x1024xf32>
%6 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<1024x512xf32>
%7 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<2048x512xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = muli %workgroup_id_y, %c64 : index
%9 = muli %workgroup_count_y, %c64 : index
%10 = muli %workgroup_id_x, %c128 : index
%11 = muli %workgroup_count_x, %c128 : index
%12 = muli %1, %c16 : index
%13 = muli %0, %c4 : index
%14 = memref.subview %4[0, 0] [64, 8] [1, 1] : memref<64x8xf32, 3> to memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
%15 = memref.subview %3[0, 0] [8, 128] [1, 1] : memref<8x128xf32, 3> to memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
%16 = muli %2, %c64 : index
%17 = addi %12, %16 : index
%18 = cmpi slt, %0, %c0 : index
%19 = subi %c-1, %0 : index
%20 = select %18, %19, %0 : index
%21 = divi_signed %20, %c2 : index
%22 = subi %c-1, %21 : index
%23 = select %18, %22, %21 : index
%24 = addi %17, %23 : index
%25 = muli %23, %c-8 : index
%26 = addi %13, %25 : index
%27 = muli %2, %c4 : index
%28 = addi %1, %27 : index
%29 = divi_signed %20, %c32 : index
%30 = subi %c-1, %29 : index
%31 = select %18, %30, %29 : index
%32 = addi %28, %31 : index
%33 = muli %31, %c-128 : index
%34 = addi %13, %33 : index
%35 = addi %32, %c4 : index
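    // note: the cmpi/subi/select sequences above are the standard expansion of signed
    // floor division (%20 floordiv 2 and %20 floordiv 32); they split the x thread id
    // into the row/column coordinates used for the cooperative copies into workgroup
    // memory (see the transfer_writes to %14 and %15 inside the loop below).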
%36 = memref.subview %14[%12, 0] [16, 8] [1, 1] : memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3> to memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>
%37 = memref.subview %15[0, %13] [8, 4] [1, 1] : memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3> to memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>
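    // note: %36 is this thread's 16x8 view of the 64x8 LHS tile in workgroup memory and
    // %37 its 8x4 view of the 8x128 RHS tile; the two loops below walk the 64x128
    // workgroup tiles of the 2048x512 result.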
scf.for %arg0 = %8 to %c2048 step %9 {
%38 = memref.subview %5[%arg0, 0] [64, 1024] [1, 1] : memref<2048x1024xf32> to memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
scf.for %arg1 = %10 to %c512 step %11 {
%39 = memref.subview %6[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32> to memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%40 = memref.subview %7[%arg0, %arg1] [64, 128] [1, 1] : memref<2048x512xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%41 = memref.subview %40[%12, %13] [16, 4] [1, 1] : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%42 = memref.subview %38[0, 0] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%43 = memref.subview %39[0, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%44 = vector.transfer_read %42[%24, %26], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%45 = vector.transfer_read %43[%32, %34], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%46 = vector.transfer_read %43[%35, %34], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
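    // note: software-pipelining prologue -- the first K-tile is fetched from global
    // memory up front (reads tagged __pipelining_global_load__), and %arg19..%arg21
    // carry the next tile's loads one stage ahead of the compute; the loop bound of
    // 1016 (step 8) suggests the final K-tile is drained after the loop, as in the
    // peeled epilogue visible in the previous dump.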
%47:19 = scf.for %arg2 = %c0 to %c1016 step %c8 iter_args(%arg3 = %cst, %arg4 = %cst, %arg5 = %cst, %arg6 = %cst, %arg7 = %cst, %arg8 = %cst, %arg9 = %cst, %arg10 = %cst, %arg11 = %cst, %arg12 = %cst, %arg13 = %cst, %arg14 = %cst, %arg15 = %cst, %arg16 = %cst, %arg17 = %cst, %arg18 = %cst, %arg19 = %44, %arg20 = %45, %arg21 = %46) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
gpu.barrier
vector.transfer_write %arg19, %14[%24, %26] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %arg20, %15[%32, %34] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %arg21, %15[%35, %34] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
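    // note: the two barriers bracket the shared-memory staging step -- the loop-carried
    // global loads are written into the LHS/RHS workgroup tiles before any thread
    // starts reading them back.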
%600 = vector.transfer_read %36[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%601 = vector.transfer_read %36[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%602 = vector.transfer_read %36[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%603 = vector.transfer_read %36[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%604 = vector.transfer_read %36[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%605 = vector.transfer_read %36[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%606 = vector.transfer_read %36[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%607 = vector.transfer_read %36[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%608 = vector.transfer_read %36[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%609 = vector.transfer_read %36[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%610 = vector.transfer_read %36[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%611 = vector.transfer_read %36[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%612 = vector.transfer_read %36[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%613 = vector.transfer_read %36[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%614 = vector.transfer_read %36[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%615 = vector.transfer_read %36[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%616 = vector.transfer_read %36[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%617 = vector.transfer_read %36[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%618 = vector.transfer_read %36[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%619 = vector.transfer_read %36[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%620 = vector.transfer_read %36[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%621 = vector.transfer_read %36[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%622 = vector.transfer_read %36[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%623 = vector.transfer_read %36[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%624 = vector.transfer_read %36[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%625 = vector.transfer_read %36[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%626 = vector.transfer_read %36[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%627 = vector.transfer_read %36[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%628 = vector.transfer_read %36[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%629 = vector.transfer_read %36[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%630 = vector.transfer_read %36[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%631 = vector.transfer_read %36[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%632 = vector.transfer_read %37[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%633 = vector.transfer_read %37[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%634 = vector.transfer_read %37[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%635 = vector.transfer_read %37[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%636 = vector.transfer_read %37[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%637 = vector.transfer_read %37[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%638 = vector.transfer_read %37[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%639 = vector.transfer_read %37[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
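    // note: per-iteration reads -- each of the 16 LHS rows is loaded as two
    // vector<4xf32> halves (%600..%631) and the eight 4-wide RHS rows as %632..%639;
    // the unrolled splat/fma chains that follow accumulate the 16x4 outer-product
    // update for this K-tile.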
%640 = vector.extract_strided_slice %600 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%641 = vector.extract %640[0] : vector<1xf32>
%642 = splat %641 : vector<4xf32>
%643 = vector.fma %642, %632, %arg3 : vector<4xf32>
%644 = vector.extract_strided_slice %600 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%645 = vector.extract %644[0] : vector<1xf32>
%646 = splat %645 : vector<4xf32>
%647 = vector.fma %646, %633, %643 : vector<4xf32>
%648 = vector.extract_strided_slice %600 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%649 = vector.extract %648[0] : vector<1xf32>
%650 = splat %649 : vector<4xf32>
%651 = vector.fma %650, %634, %647 : vector<4xf32>
%652 = vector.extract_strided_slice %600 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%653 = vector.extract %652[0] : vector<1xf32>
%654 = splat %653 : vector<4xf32>
%655 = vector.fma %654, %635, %651 : vector<4xf32>
%656 = vector.extract_strided_slice %601 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%657 = vector.extract %656[0] : vector<1xf32>
%658 = splat %657 : vector<4xf32>
%659 = vector.fma %658, %636, %655 : vector<4xf32>
%660 = vector.extract_strided_slice %601 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%661 = vector.extract %660[0] : vector<1xf32>
%662 = splat %661 : vector<4xf32>
%663 = vector.fma %662, %637, %659 : vector<4xf32>
%664 = vector.extract_strided_slice %601 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%665 = vector.extract %664[0] : vector<1xf32>
%666 = splat %665 : vector<4xf32>
%667 = vector.fma %666, %638, %663 : vector<4xf32>
%668 = vector.extract_strided_slice %601 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%669 = vector.extract %668[0] : vector<1xf32>
%670 = splat %669 : vector<4xf32>
%671 = vector.fma %670, %639, %667 : vector<4xf32>
%672 = vector.extract_strided_slice %602 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%673 = vector.extract %672[0] : vector<1xf32>
%674 = splat %673 : vector<4xf32>
%675 = vector.fma %674, %632, %arg4 : vector<4xf32>
%676 = vector.extract_strided_slice %602 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%677 = vector.extract %676[0] : vector<1xf32>
%678 = splat %677 : vector<4xf32>
%679 = vector.fma %678, %633, %675 : vector<4xf32>
%680 = vector.extract_strided_slice %602 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%681 = vector.extract %680[0] : vector<1xf32>
%682 = splat %681 : vector<4xf32>
%683 = vector.fma %682, %634, %679 : vector<4xf32>
%684 = vector.extract_strided_slice %602 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%685 = vector.extract %684[0] : vector<1xf32>
%686 = splat %685 : vector<4xf32>
%687 = vector.fma %686, %635, %683 : vector<4xf32>
%688 = vector.extract_strided_slice %603 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%689 = vector.extract %688[0] : vector<1xf32>
%690 = splat %689 : vector<4xf32>
%691 = vector.fma %690, %636, %687 : vector<4xf32>
%692 = vector.extract_strided_slice %603 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%693 = vector.extract %692[0] : vector<1xf32>
%694 = splat %693 : vector<4xf32>
%695 = vector.fma %694, %637, %691 : vector<4xf32>
%696 = vector.extract_strided_slice %603 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%697 = vector.extract %696[0] : vector<1xf32>
%698 = splat %697 : vector<4xf32>
%699 = vector.fma %698, %638, %695 : vector<4xf32>
%700 = vector.extract_strided_slice %603 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%701 = vector.extract %700[0] : vector<1xf32>
%702 = splat %701 : vector<4xf32>
%703 = vector.fma %702, %639, %699 : vector<4xf32>
%704 = vector.extract_strided_slice %604 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%705 = vector.extract %704[0] : vector<1xf32>
%706 = splat %705 : vector<4xf32>
%707 = vector.fma %706, %632, %arg5 : vector<4xf32>
%708 = vector.extract_strided_slice %604 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%709 = vector.extract %708[0] : vector<1xf32>
%710 = splat %709 : vector<4xf32>
%711 = vector.fma %710, %633, %707 : vector<4xf32>
%712 = vector.extract_strided_slice %604 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%713 = vector.extract %712[0] : vector<1xf32>
%714 = splat %713 : vector<4xf32>
%715 = vector.fma %714, %634, %711 : vector<4xf32>
%716 = vector.extract_strided_slice %604 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%717 = vector.extract %716[0] : vector<1xf32>
%718 = splat %717 : vector<4xf32>
%719 = vector.fma %718, %635, %715 : vector<4xf32>
%720 = vector.extract_strided_slice %605 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%721 = vector.extract %720[0] : vector<1xf32>
%722 = splat %721 : vector<4xf32>
%723 = vector.fma %722, %636, %719 : vector<4xf32>
%724 = vector.extract_strided_slice %605 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%725 = vector.extract %724[0] : vector<1xf32>
%726 = splat %725 : vector<4xf32>
%727 = vector.fma %726, %637, %723 : vector<4xf32>
%728 = vector.extract_strided_slice %605 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%729 = vector.extract %728[0] : vector<1xf32>
%730 = splat %729 : vector<4xf32>
%731 = vector.fma %730, %638, %727 : vector<4xf32>
%732 = vector.extract_strided_slice %605 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%733 = vector.extract %732[0] : vector<1xf32>
%734 = splat %733 : vector<4xf32>
%735 = vector.fma %734, %639, %731 : vector<4xf32>
%736 = vector.extract_strided_slice %606 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%737 = vector.extract %736[0] : vector<1xf32>
%738 = splat %737 : vector<4xf32>
%739 = vector.fma %738, %632, %arg6 : vector<4xf32>
%740 = vector.extract_strided_slice %606 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%741 = vector.extract %740[0] : vector<1xf32>
%742 = splat %741 : vector<4xf32>
%743 = vector.fma %742, %633, %739 : vector<4xf32>
%744 = vector.extract_strided_slice %606 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%745 = vector.extract %744[0] : vector<1xf32>
%746 = splat %745 : vector<4xf32>
%747 = vector.fma %746, %634, %743 : vector<4xf32>
%748 = vector.extract_strided_slice %606 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%749 = vector.extract %748[0] : vector<1xf32>
%750 = splat %749 : vector<4xf32>
%751 = vector.fma %750, %635, %747 : vector<4xf32>
%752 = vector.extract_strided_slice %607 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%753 = vector.extract %752[0] : vector<1xf32>
%754 = splat %753 : vector<4xf32>
%755 = vector.fma %754, %636, %751 : vector<4xf32>
%756 = vector.extract_strided_slice %607 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%757 = vector.extract %756[0] : vector<1xf32>
%758 = splat %757 : vector<4xf32>
%759 = vector.fma %758, %637, %755 : vector<4xf32>
%760 = vector.extract_strided_slice %607 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%761 = vector.extract %760[0] : vector<1xf32>
%762 = splat %761 : vector<4xf32>
%763 = vector.fma %762, %638, %759 : vector<4xf32>
%764 = vector.extract_strided_slice %607 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%765 = vector.extract %764[0] : vector<1xf32>
%766 = splat %765 : vector<4xf32>
%767 = vector.fma %766, %639, %763 : vector<4xf32>
%768 = vector.extract_strided_slice %608 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%769 = vector.extract %768[0] : vector<1xf32>
%770 = splat %769 : vector<4xf32>
%771 = vector.fma %770, %632, %arg7 : vector<4xf32>
%772 = vector.extract_strided_slice %608 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%773 = vector.extract %772[0] : vector<1xf32>
%774 = splat %773 : vector<4xf32>
%775 = vector.fma %774, %633, %771 : vector<4xf32>
%776 = vector.extract_strided_slice %608 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%777 = vector.extract %776[0] : vector<1xf32>
%778 = splat %777 : vector<4xf32>
%779 = vector.fma %778, %634, %775 : vector<4xf32>
%780 = vector.extract_strided_slice %608 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%781 = vector.extract %780[0] : vector<1xf32>
%782 = splat %781 : vector<4xf32>
%783 = vector.fma %782, %635, %779 : vector<4xf32>
%784 = vector.extract_strided_slice %609 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%785 = vector.extract %784[0] : vector<1xf32>
%786 = splat %785 : vector<4xf32>
%787 = vector.fma %786, %636, %783 : vector<4xf32>
%788 = vector.extract_strided_slice %609 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%789 = vector.extract %788[0] : vector<1xf32>
%790 = splat %789 : vector<4xf32>
%791 = vector.fma %790, %637, %787 : vector<4xf32>
%792 = vector.extract_strided_slice %609 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%793 = vector.extract %792[0] : vector<1xf32>
%794 = splat %793 : vector<4xf32>
%795 = vector.fma %794, %638, %791 : vector<4xf32>
%796 = vector.extract_strided_slice %609 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%797 = vector.extract %796[0] : vector<1xf32>
%798 = splat %797 : vector<4xf32>
%799 = vector.fma %798, %639, %795 : vector<4xf32>
%800 = vector.extract_strided_slice %610 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%801 = vector.extract %800[0] : vector<1xf32>
%802 = splat %801 : vector<4xf32>
%803 = vector.fma %802, %632, %arg8 : vector<4xf32>
%804 = vector.extract_strided_slice %610 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%805 = vector.extract %804[0] : vector<1xf32>
%806 = splat %805 : vector<4xf32>
%807 = vector.fma %806, %633, %803 : vector<4xf32>
%808 = vector.extract_strided_slice %610 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%809 = vector.extract %808[0] : vector<1xf32>
%810 = splat %809 : vector<4xf32>
%811 = vector.fma %810, %634, %807 : vector<4xf32>
%812 = vector.extract_strided_slice %610 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%813 = vector.extract %812[0] : vector<1xf32>
%814 = splat %813 : vector<4xf32>
%815 = vector.fma %814, %635, %811 : vector<4xf32>
%816 = vector.extract_strided_slice %611 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%817 = vector.extract %816[0] : vector<1xf32>
%818 = splat %817 : vector<4xf32>
%819 = vector.fma %818, %636, %815 : vector<4xf32>
%820 = vector.extract_strided_slice %611 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%821 = vector.extract %820[0] : vector<1xf32>
%822 = splat %821 : vector<4xf32>
%823 = vector.fma %822, %637, %819 : vector<4xf32>
%824 = vector.extract_strided_slice %611 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%825 = vector.extract %824[0] : vector<1xf32>
%826 = splat %825 : vector<4xf32>
%827 = vector.fma %826, %638, %823 : vector<4xf32>
%828 = vector.extract_strided_slice %611 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%829 = vector.extract %828[0] : vector<1xf32>
%830 = splat %829 : vector<4xf32>
%831 = vector.fma %830, %639, %827 : vector<4xf32>
%832 = vector.extract_strided_slice %612 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%833 = vector.extract %832[0] : vector<1xf32>
%834 = splat %833 : vector<4xf32>
%835 = vector.fma %834, %632, %arg9 : vector<4xf32>
%836 = vector.extract_strided_slice %612 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%837 = vector.extract %836[0] : vector<1xf32>
%838 = splat %837 : vector<4xf32>
%839 = vector.fma %838, %633, %835 : vector<4xf32>
%840 = vector.extract_strided_slice %612 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%841 = vector.extract %840[0] : vector<1xf32>
%842 = splat %841 : vector<4xf32>
%843 = vector.fma %842, %634, %839 : vector<4xf32>
%844 = vector.extract_strided_slice %612 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%845 = vector.extract %844[0] : vector<1xf32>
%846 = splat %845 : vector<4xf32>
%847 = vector.fma %846, %635, %843 : vector<4xf32>
%848 = vector.extract_strided_slice %613 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%849 = vector.extract %848[0] : vector<1xf32>
%850 = splat %849 : vector<4xf32>
%851 = vector.fma %850, %636, %847 : vector<4xf32>
%852 = vector.extract_strided_slice %613 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%853 = vector.extract %852[0] : vector<1xf32>
%854 = splat %853 : vector<4xf32>
%855 = vector.fma %854, %637, %851 : vector<4xf32>
%856 = vector.extract_strided_slice %613 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%857 = vector.extract %856[0] : vector<1xf32>
%858 = splat %857 : vector<4xf32>
%859 = vector.fma %858, %638, %855 : vector<4xf32>
%860 = vector.extract_strided_slice %613 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%861 = vector.extract %860[0] : vector<1xf32>
%862 = splat %861 : vector<4xf32>
%863 = vector.fma %862, %639, %859 : vector<4xf32>
%864 = vector.extract_strided_slice %614 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%865 = vector.extract %864[0] : vector<1xf32>
%866 = splat %865 : vector<4xf32>
%867 = vector.fma %866, %632, %arg10 : vector<4xf32>
%868 = vector.extract_strided_slice %614 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%869 = vector.extract %868[0] : vector<1xf32>
%870 = splat %869 : vector<4xf32>
%871 = vector.fma %870, %633, %867 : vector<4xf32>
%872 = vector.extract_strided_slice %614 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%873 = vector.extract %872[0] : vector<1xf32>
%874 = splat %873 : vector<4xf32>
%875 = vector.fma %874, %634, %871 : vector<4xf32>
%876 = vector.extract_strided_slice %614 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%877 = vector.extract %876[0] : vector<1xf32>
%878 = splat %877 : vector<4xf32>
%879 = vector.fma %878, %635, %875 : vector<4xf32>
%880 = vector.extract_strided_slice %615 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%881 = vector.extract %880[0] : vector<1xf32>
%882 = splat %881 : vector<4xf32>
%883 = vector.fma %882, %636, %879 : vector<4xf32>
%884 = vector.extract_strided_slice %615 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%885 = vector.extract %884[0] : vector<1xf32>
%886 = splat %885 : vector<4xf32>
%887 = vector.fma %886, %637, %883 : vector<4xf32>
%888 = vector.extract_strided_slice %615 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%889 = vector.extract %888[0] : vector<1xf32>
%890 = splat %889 : vector<4xf32>
%891 = vector.fma %890, %638, %887 : vector<4xf32>
%892 = vector.extract_strided_slice %615 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%893 = vector.extract %892[0] : vector<1xf32>
%894 = splat %893 : vector<4xf32>
%895 = vector.fma %894, %639, %891 : vector<4xf32>
%896 = vector.extract_strided_slice %616 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%897 = vector.extract %896[0] : vector<1xf32>
%898 = splat %897 : vector<4xf32>
%899 = vector.fma %898, %632, %arg11 : vector<4xf32>
%900 = vector.extract_strided_slice %616 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%901 = vector.extract %900[0] : vector<1xf32>
%902 = splat %901 : vector<4xf32>
%903 = vector.fma %902, %633, %899 : vector<4xf32>
%904 = vector.extract_strided_slice %616 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%905 = vector.extract %904[0] : vector<1xf32>
%906 = splat %905 : vector<4xf32>
%907 = vector.fma %906, %634, %903 : vector<4xf32>
%908 = vector.extract_strided_slice %616 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%909 = vector.extract %908[0] : vector<1xf32>
%910 = splat %909 : vector<4xf32>
%911 = vector.fma %910, %635, %907 : vector<4xf32>
%912 = vector.extract_strided_slice %617 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%913 = vector.extract %912[0] : vector<1xf32>
%914 = splat %913 : vector<4xf32>
%915 = vector.fma %914, %636, %911 : vector<4xf32>
%916 = vector.extract_strided_slice %617 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%917 = vector.extract %916[0] : vector<1xf32>
%918 = splat %917 : vector<4xf32>
%919 = vector.fma %918, %637, %915 : vector<4xf32>
%920 = vector.extract_strided_slice %617 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%921 = vector.extract %920[0] : vector<1xf32>
%922 = splat %921 : vector<4xf32>
%923 = vector.fma %922, %638, %919 : vector<4xf32>
%924 = vector.extract_strided_slice %617 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%925 = vector.extract %924[0] : vector<1xf32>
%926 = splat %925 : vector<4xf32>
%927 = vector.fma %926, %639, %923 : vector<4xf32>
%928 = vector.extract_strided_slice %618 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%929 = vector.extract %928[0] : vector<1xf32>
%930 = splat %929 : vector<4xf32>
%931 = vector.fma %930, %632, %arg12 : vector<4xf32>
%932 = vector.extract_strided_slice %618 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%933 = vector.extract %932[0] : vector<1xf32>
%934 = splat %933 : vector<4xf32>
%935 = vector.fma %934, %633, %931 : vector<4xf32>
%936 = vector.extract_strided_slice %618 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%937 = vector.extract %936[0] : vector<1xf32>
%938 = splat %937 : vector<4xf32>
%939 = vector.fma %938, %634, %935 : vector<4xf32>
%940 = vector.extract_strided_slice %618 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%941 = vector.extract %940[0] : vector<1xf32>
%942 = splat %941 : vector<4xf32>
%943 = vector.fma %942, %635, %939 : vector<4xf32>
%944 = vector.extract_strided_slice %619 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%945 = vector.extract %944[0] : vector<1xf32>
%946 = splat %945 : vector<4xf32>
%947 = vector.fma %946, %636, %943 : vector<4xf32>
%948 = vector.extract_strided_slice %619 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%949 = vector.extract %948[0] : vector<1xf32>
%950 = splat %949 : vector<4xf32>
%951 = vector.fma %950, %637, %947 : vector<4xf32>
%952 = vector.extract_strided_slice %619 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%953 = vector.extract %952[0] : vector<1xf32>
%954 = splat %953 : vector<4xf32>
%955 = vector.fma %954, %638, %951 : vector<4xf32>
%956 = vector.extract_strided_slice %619 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%957 = vector.extract %956[0] : vector<1xf32>
%958 = splat %957 : vector<4xf32>
%959 = vector.fma %958, %639, %955 : vector<4xf32>
%960 = vector.extract_strided_slice %620 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%961 = vector.extract %960[0] : vector<1xf32>
%962 = splat %961 : vector<4xf32>
%963 = vector.fma %962, %632, %arg13 : vector<4xf32>
%964 = vector.extract_strided_slice %620 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%965 = vector.extract %964[0] : vector<1xf32>
%966 = splat %965 : vector<4xf32>
%967 = vector.fma %966, %633, %963 : vector<4xf32>
%968 = vector.extract_strided_slice %620 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%969 = vector.extract %968[0] : vector<1xf32>
%970 = splat %969 : vector<4xf32>
%971 = vector.fma %970, %634, %967 : vector<4xf32>
%972 = vector.extract_strided_slice %620 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%973 = vector.extract %972[0] : vector<1xf32>
%974 = splat %973 : vector<4xf32>
%975 = vector.fma %974, %635, %971 : vector<4xf32>
%976 = vector.extract_strided_slice %621 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%977 = vector.extract %976[0] : vector<1xf32>
%978 = splat %977 : vector<4xf32>
%979 = vector.fma %978, %636, %975 : vector<4xf32>
%980 = vector.extract_strided_slice %621 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%981 = vector.extract %980[0] : vector<1xf32>
%982 = splat %981 : vector<4xf32>
%983 = vector.fma %982, %637, %979 : vector<4xf32>
%984 = vector.extract_strided_slice %621 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%985 = vector.extract %984[0] : vector<1xf32>
%986 = splat %985 : vector<4xf32>
%987 = vector.fma %986, %638, %983 : vector<4xf32>
%988 = vector.extract_strided_slice %621 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%989 = vector.extract %988[0] : vector<1xf32>
%990 = splat %989 : vector<4xf32>
%991 = vector.fma %990, %639, %987 : vector<4xf32>
%992 = vector.extract_strided_slice %622 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%993 = vector.extract %992[0] : vector<1xf32>
%994 = splat %993 : vector<4xf32>
%995 = vector.fma %994, %632, %arg14 : vector<4xf32>
%996 = vector.extract_strided_slice %622 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%997 = vector.extract %996[0] : vector<1xf32>
%998 = splat %997 : vector<4xf32>
%999 = vector.fma %998, %633, %995 : vector<4xf32>
%1000 = vector.extract_strided_slice %622 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1001 = vector.extract %1000[0] : vector<1xf32>
%1002 = splat %1001 : vector<4xf32>
%1003 = vector.fma %1002, %634, %999 : vector<4xf32>
%1004 = vector.extract_strided_slice %622 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1005 = vector.extract %1004[0] : vector<1xf32>
%1006 = splat %1005 : vector<4xf32>
%1007 = vector.fma %1006, %635, %1003 : vector<4xf32>
%1008 = vector.extract_strided_slice %623 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1009 = vector.extract %1008[0] : vector<1xf32>
%1010 = splat %1009 : vector<4xf32>
%1011 = vector.fma %1010, %636, %1007 : vector<4xf32>
%1012 = vector.extract_strided_slice %623 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1013 = vector.extract %1012[0] : vector<1xf32>
%1014 = splat %1013 : vector<4xf32>
%1015 = vector.fma %1014, %637, %1011 : vector<4xf32>
%1016 = vector.extract_strided_slice %623 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1017 = vector.extract %1016[0] : vector<1xf32>
%1018 = splat %1017 : vector<4xf32>
%1019 = vector.fma %1018, %638, %1015 : vector<4xf32>
%1020 = vector.extract_strided_slice %623 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1021 = vector.extract %1020[0] : vector<1xf32>
%1022 = splat %1021 : vector<4xf32>
%1023 = vector.fma %1022, %639, %1019 : vector<4xf32>
%1024 = vector.extract_strided_slice %624 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1025 = vector.extract %1024[0] : vector<1xf32>
%1026 = splat %1025 : vector<4xf32>
%1027 = vector.fma %1026, %632, %arg15 : vector<4xf32>
%1028 = vector.extract_strided_slice %624 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1029 = vector.extract %1028[0] : vector<1xf32>
%1030 = splat %1029 : vector<4xf32>
%1031 = vector.fma %1030, %633, %1027 : vector<4xf32>
%1032 = vector.extract_strided_slice %624 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1033 = vector.extract %1032[0] : vector<1xf32>
%1034 = splat %1033 : vector<4xf32>
%1035 = vector.fma %1034, %634, %1031 : vector<4xf32>
%1036 = vector.extract_strided_slice %624 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1037 = vector.extract %1036[0] : vector<1xf32>
%1038 = splat %1037 : vector<4xf32>
%1039 = vector.fma %1038, %635, %1035 : vector<4xf32>
%1040 = vector.extract_strided_slice %625 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1041 = vector.extract %1040[0] : vector<1xf32>
%1042 = splat %1041 : vector<4xf32>
%1043 = vector.fma %1042, %636, %1039 : vector<4xf32>
%1044 = vector.extract_strided_slice %625 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1045 = vector.extract %1044[0] : vector<1xf32>
%1046 = splat %1045 : vector<4xf32>
%1047 = vector.fma %1046, %637, %1043 : vector<4xf32>
%1048 = vector.extract_strided_slice %625 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1049 = vector.extract %1048[0] : vector<1xf32>
%1050 = splat %1049 : vector<4xf32>
%1051 = vector.fma %1050, %638, %1047 : vector<4xf32>
%1052 = vector.extract_strided_slice %625 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1053 = vector.extract %1052[0] : vector<1xf32>
%1054 = splat %1053 : vector<4xf32>
%1055 = vector.fma %1054, %639, %1051 : vector<4xf32>
%1056 = vector.extract_strided_slice %626 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1057 = vector.extract %1056[0] : vector<1xf32>
%1058 = splat %1057 : vector<4xf32>
%1059 = vector.fma %1058, %632, %arg16 : vector<4xf32>
%1060 = vector.extract_strided_slice %626 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1061 = vector.extract %1060[0] : vector<1xf32>
%1062 = splat %1061 : vector<4xf32>
%1063 = vector.fma %1062, %633, %1059 : vector<4xf32>
%1064 = vector.extract_strided_slice %626 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1065 = vector.extract %1064[0] : vector<1xf32>
%1066 = splat %1065 : vector<4xf32>
%1067 = vector.fma %1066, %634, %1063 : vector<4xf32>
%1068 = vector.extract_strided_slice %626 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1069 = vector.extract %1068[0] : vector<1xf32>
%1070 = splat %1069 : vector<4xf32>
%1071 = vector.fma %1070, %635, %1067 : vector<4xf32>
%1072 = vector.extract_strided_slice %627 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1073 = vector.extract %1072[0] : vector<1xf32>
%1074 = splat %1073 : vector<4xf32>
%1075 = vector.fma %1074, %636, %1071 : vector<4xf32>
%1076 = vector.extract_strided_slice %627 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1077 = vector.extract %1076[0] : vector<1xf32>
%1078 = splat %1077 : vector<4xf32>
%1079 = vector.fma %1078, %637, %1075 : vector<4xf32>
%1080 = vector.extract_strided_slice %627 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1081 = vector.extract %1080[0] : vector<1xf32>
%1082 = splat %1081 : vector<4xf32>
%1083 = vector.fma %1082, %638, %1079 : vector<4xf32>
%1084 = vector.extract_strided_slice %627 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1085 = vector.extract %1084[0] : vector<1xf32>
%1086 = splat %1085 : vector<4xf32>
%1087 = vector.fma %1086, %639, %1083 : vector<4xf32>
%1088 = vector.extract_strided_slice %628 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1089 = vector.extract %1088[0] : vector<1xf32>
%1090 = splat %1089 : vector<4xf32>
%1091 = vector.fma %1090, %632, %arg17 : vector<4xf32>
%1092 = vector.extract_strided_slice %628 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1093 = vector.extract %1092[0] : vector<1xf32>
%1094 = splat %1093 : vector<4xf32>
%1095 = vector.fma %1094, %633, %1091 : vector<4xf32>
%1096 = vector.extract_strided_slice %628 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1097 = vector.extract %1096[0] : vector<1xf32>
%1098 = splat %1097 : vector<4xf32>
%1099 = vector.fma %1098, %634, %1095 : vector<4xf32>
%1100 = vector.extract_strided_slice %628 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1101 = vector.extract %1100[0] : vector<1xf32>
%1102 = splat %1101 : vector<4xf32>
%1103 = vector.fma %1102, %635, %1099 : vector<4xf32>
%1104 = vector.extract_strided_slice %629 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1105 = vector.extract %1104[0] : vector<1xf32>
%1106 = splat %1105 : vector<4xf32>
%1107 = vector.fma %1106, %636, %1103 : vector<4xf32>
%1108 = vector.extract_strided_slice %629 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1109 = vector.extract %1108[0] : vector<1xf32>
%1110 = splat %1109 : vector<4xf32>
%1111 = vector.fma %1110, %637, %1107 : vector<4xf32>
%1112 = vector.extract_strided_slice %629 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1113 = vector.extract %1112[0] : vector<1xf32>
%1114 = splat %1113 : vector<4xf32>
%1115 = vector.fma %1114, %638, %1111 : vector<4xf32>
%1116 = vector.extract_strided_slice %629 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1117 = vector.extract %1116[0] : vector<1xf32>
%1118 = splat %1117 : vector<4xf32>
%1119 = vector.fma %1118, %639, %1115 : vector<4xf32>
%1120 = vector.extract_strided_slice %630 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1121 = vector.extract %1120[0] : vector<1xf32>
%1122 = splat %1121 : vector<4xf32>
%1123 = vector.fma %1122, %632, %arg18 : vector<4xf32>
%1124 = vector.extract_strided_slice %630 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1125 = vector.extract %1124[0] : vector<1xf32>
%1126 = splat %1125 : vector<4xf32>
%1127 = vector.fma %1126, %633, %1123 : vector<4xf32>
%1128 = vector.extract_strided_slice %630 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1129 = vector.extract %1128[0] : vector<1xf32>
%1130 = splat %1129 : vector<4xf32>
%1131 = vector.fma %1130, %634, %1127 : vector<4xf32>
%1132 = vector.extract_strided_slice %630 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1133 = vector.extract %1132[0] : vector<1xf32>
%1134 = splat %1133 : vector<4xf32>
%1135 = vector.fma %1134, %635, %1131 : vector<4xf32>
%1136 = vector.extract_strided_slice %631 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1137 = vector.extract %1136[0] : vector<1xf32>
%1138 = splat %1137 : vector<4xf32>
%1139 = vector.fma %1138, %636, %1135 : vector<4xf32>
%1140 = vector.extract_strided_slice %631 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1141 = vector.extract %1140[0] : vector<1xf32>
%1142 = splat %1141 : vector<4xf32>
%1143 = vector.fma %1142, %637, %1139 : vector<4xf32>
%1144 = vector.extract_strided_slice %631 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1145 = vector.extract %1144[0] : vector<1xf32>
%1146 = splat %1145 : vector<4xf32>
%1147 = vector.fma %1146, %638, %1143 : vector<4xf32>
%1148 = vector.extract_strided_slice %631 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%1149 = vector.extract %1148[0] : vector<1xf32>
%1150 = splat %1149 : vector<4xf32>
%1151 = vector.fma %1150, %639, %1147 : vector<4xf32>
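// Annotation: end of the unrolled k-step. The __pipelining_global_load__ reads
// below implement software pipelining: the reduction index is advanced by 8 and
// the next 64x8 A tile (%1153) and 8x128 B tile (%1154) are loaded from global
// memory so the loads overlap with this iteration's FMAs.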
%1152 = addi %arg2, %c8 : index
%1153 = memref.subview %38[0, %1152] [64, 8] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%1154 = memref.subview %39[%1152, 0] [8, 128] [1, 1] : memref<1024x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
%1155 = vector.transfer_read %1153[%24, %26], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<64x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
%1156 = vector.transfer_read %1154[%32, %34], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
%1157 = vector.transfer_read %1154[%35, %34], %cst_0 {__pipelining_global_load__, in_bounds = [true]} : memref<8x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>, vector<4xf32>
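// Annotation: the yield carries the 16 row accumulators plus the three freshly
// prefetched vectors into the next loop iteration.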
scf.yield %671, %703, %735, %767, %799, %831, %863, %895, %927, %959, %991, %1023, %1055, %1087, %1119, %1151, %1155, %1156, %1157 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
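// Annotation: pipeline epilogue. The last prefetched vectors (%47#16..%47#18)
// are staged into workgroup memory (memref space 3), with gpu.barriers on both
// sides to synchronize the workgroup around the shared-memory writes.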
gpu.barrier
vector.transfer_write %47#16, %14[%24, %26] {in_bounds = [true]} : vector<4xf32>, memref<64x8xf32, affine_map<(d0, d1) -> (d0 * 8 + d1)>, 3>
vector.transfer_write %47#17, %15[%32, %34] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
vector.transfer_write %47#18, %15[%35, %34] {in_bounds = [true]} : vector<4xf32>, memref<8x128xf32, affine_map<(d0, d1) -> (d0 * 128 + d1)>, 3>
gpu.barrier
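// Annotation: drain the final k-tile from workgroup memory. %36 is this thread's
// 16x8 view of the staged A tile, read as two 4-wide vectors per row (columns 0
// and 4).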
%48 = vector.transfer_read %36[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%49 = vector.transfer_read %36[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%50 = vector.transfer_read %36[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%51 = vector.transfer_read %36[%c1, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%52 = vector.transfer_read %36[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%53 = vector.transfer_read %36[%c2, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%54 = vector.transfer_read %36[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%55 = vector.transfer_read %36[%c3, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%56 = vector.transfer_read %36[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%57 = vector.transfer_read %36[%c4, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%58 = vector.transfer_read %36[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%59 = vector.transfer_read %36[%c5, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%60 = vector.transfer_read %36[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%61 = vector.transfer_read %36[%c6, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%62 = vector.transfer_read %36[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%63 = vector.transfer_read %36[%c7, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%64 = vector.transfer_read %36[%c8, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%65 = vector.transfer_read %36[%c8, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%66 = vector.transfer_read %36[%c9, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%67 = vector.transfer_read %36[%c9, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%68 = vector.transfer_read %36[%c10, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%69 = vector.transfer_read %36[%c10, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%70 = vector.transfer_read %36[%c11, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%71 = vector.transfer_read %36[%c11, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%72 = vector.transfer_read %36[%c12, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%73 = vector.transfer_read %36[%c12, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%74 = vector.transfer_read %36[%c13, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%75 = vector.transfer_read %36[%c13, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%76 = vector.transfer_read %36[%c14, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%77 = vector.transfer_read %36[%c14, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%78 = vector.transfer_read %36[%c15, %c0], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
%79 = vector.transfer_read %36[%c15, %c4], %cst_0 {in_bounds = [true]} : memref<16x8xf32, affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>, 3>, vector<4xf32>
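// Annotation: %37 is the matching 8x4 view of the staged B tile, one 4-wide
// vector per k row.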
%80 = vector.transfer_read %37[%c0, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%81 = vector.transfer_read %37[%c1, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%82 = vector.transfer_read %37[%c2, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%83 = vector.transfer_read %37[%c3, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%84 = vector.transfer_read %37[%c4, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%85 = vector.transfer_read %37[%c5, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%86 = vector.transfer_read %37[%c6, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
%87 = vector.transfer_read %37[%c7, %c0], %cst_0 {in_bounds = [true]} : memref<8x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>, 3>, vector<4xf32>
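// Annotation: final FMA chains, using the same broadcast-and-fma pattern as the
// loop body but accumulating into the sixteen loop results %47#0 ... %47#15.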
%88 = vector.extract_strided_slice %48 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%89 = vector.extract %88[0] : vector<1xf32>
%90 = splat %89 : vector<4xf32>
%91 = vector.fma %90, %80, %47#0 : vector<4xf32>
%92 = vector.extract_strided_slice %48 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%93 = vector.extract %92[0] : vector<1xf32>
%94 = splat %93 : vector<4xf32>
%95 = vector.fma %94, %81, %91 : vector<4xf32>
%96 = vector.extract_strided_slice %48 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%97 = vector.extract %96[0] : vector<1xf32>
%98 = splat %97 : vector<4xf32>
%99 = vector.fma %98, %82, %95 : vector<4xf32>
%100 = vector.extract_strided_slice %48 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%101 = vector.extract %100[0] : vector<1xf32>
%102 = splat %101 : vector<4xf32>
%103 = vector.fma %102, %83, %99 : vector<4xf32>
%104 = vector.extract_strided_slice %49 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%105 = vector.extract %104[0] : vector<1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %84, %103 : vector<4xf32>
%108 = vector.extract_strided_slice %49 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%109 = vector.extract %108[0] : vector<1xf32>
%110 = splat %109 : vector<4xf32>
%111 = vector.fma %110, %85, %107 : vector<4xf32>
%112 = vector.extract_strided_slice %49 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%113 = vector.extract %112[0] : vector<1xf32>
%114 = splat %113 : vector<4xf32>
%115 = vector.fma %114, %86, %111 : vector<4xf32>
%116 = vector.extract_strided_slice %49 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%117 = vector.extract %116[0] : vector<1xf32>
%118 = splat %117 : vector<4xf32>
%119 = vector.fma %118, %87, %115 : vector<4xf32>
%120 = vector.extract_strided_slice %50 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%121 = vector.extract %120[0] : vector<1xf32>
%122 = splat %121 : vector<4xf32>
%123 = vector.fma %122, %80, %47#1 : vector<4xf32>
%124 = vector.extract_strided_slice %50 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%125 = vector.extract %124[0] : vector<1xf32>
%126 = splat %125 : vector<4xf32>
%127 = vector.fma %126, %81, %123 : vector<4xf32>
%128 = vector.extract_strided_slice %50 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%129 = vector.extract %128[0] : vector<1xf32>
%130 = splat %129 : vector<4xf32>
%131 = vector.fma %130, %82, %127 : vector<4xf32>
%132 = vector.extract_strided_slice %50 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%133 = vector.extract %132[0] : vector<1xf32>
%134 = splat %133 : vector<4xf32>
%135 = vector.fma %134, %83, %131 : vector<4xf32>
%136 = vector.extract_strided_slice %51 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%137 = vector.extract %136[0] : vector<1xf32>
%138 = splat %137 : vector<4xf32>
%139 = vector.fma %138, %84, %135 : vector<4xf32>
%140 = vector.extract_strided_slice %51 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%141 = vector.extract %140[0] : vector<1xf32>
%142 = splat %141 : vector<4xf32>
%143 = vector.fma %142, %85, %139 : vector<4xf32>
%144 = vector.extract_strided_slice %51 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%145 = vector.extract %144[0] : vector<1xf32>
%146 = splat %145 : vector<4xf32>
%147 = vector.fma %146, %86, %143 : vector<4xf32>
%148 = vector.extract_strided_slice %51 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%149 = vector.extract %148[0] : vector<1xf32>
%150 = splat %149 : vector<4xf32>
%151 = vector.fma %150, %87, %147 : vector<4xf32>
%152 = vector.extract_strided_slice %52 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%153 = vector.extract %152[0] : vector<1xf32>
%154 = splat %153 : vector<4xf32>
%155 = vector.fma %154, %80, %47#2 : vector<4xf32>
%156 = vector.extract_strided_slice %52 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%157 = vector.extract %156[0] : vector<1xf32>
%158 = splat %157 : vector<4xf32>
%159 = vector.fma %158, %81, %155 : vector<4xf32>
%160 = vector.extract_strided_slice %52 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%161 = vector.extract %160[0] : vector<1xf32>
%162 = splat %161 : vector<4xf32>
%163 = vector.fma %162, %82, %159 : vector<4xf32>
%164 = vector.extract_strided_slice %52 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%165 = vector.extract %164[0] : vector<1xf32>
%166 = splat %165 : vector<4xf32>
%167 = vector.fma %166, %83, %163 : vector<4xf32>
%168 = vector.extract_strided_slice %53 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%169 = vector.extract %168[0] : vector<1xf32>
%170 = splat %169 : vector<4xf32>
%171 = vector.fma %170, %84, %167 : vector<4xf32>
%172 = vector.extract_strided_slice %53 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%173 = vector.extract %172[0] : vector<1xf32>
%174 = splat %173 : vector<4xf32>
%175 = vector.fma %174, %85, %171 : vector<4xf32>
%176 = vector.extract_strided_slice %53 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%177 = vector.extract %176[0] : vector<1xf32>
%178 = splat %177 : vector<4xf32>
%179 = vector.fma %178, %86, %175 : vector<4xf32>
%180 = vector.extract_strided_slice %53 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%181 = vector.extract %180[0] : vector<1xf32>
%182 = splat %181 : vector<4xf32>
%183 = vector.fma %182, %87, %179 : vector<4xf32>
%184 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%185 = vector.extract %184[0] : vector<1xf32>
%186 = splat %185 : vector<4xf32>
%187 = vector.fma %186, %80, %47#3 : vector<4xf32>
%188 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%189 = vector.extract %188[0] : vector<1xf32>
%190 = splat %189 : vector<4xf32>
%191 = vector.fma %190, %81, %187 : vector<4xf32>
%192 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%193 = vector.extract %192[0] : vector<1xf32>
%194 = splat %193 : vector<4xf32>
%195 = vector.fma %194, %82, %191 : vector<4xf32>
%196 = vector.extract_strided_slice %54 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%197 = vector.extract %196[0] : vector<1xf32>
%198 = splat %197 : vector<4xf32>
%199 = vector.fma %198, %83, %195 : vector<4xf32>
%200 = vector.extract_strided_slice %55 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%201 = vector.extract %200[0] : vector<1xf32>
%202 = splat %201 : vector<4xf32>
%203 = vector.fma %202, %84, %199 : vector<4xf32>
%204 = vector.extract_strided_slice %55 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%205 = vector.extract %204[0] : vector<1xf32>
%206 = splat %205 : vector<4xf32>
%207 = vector.fma %206, %85, %203 : vector<4xf32>
%208 = vector.extract_strided_slice %55 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%209 = vector.extract %208[0] : vector<1xf32>
%210 = splat %209 : vector<4xf32>
%211 = vector.fma %210, %86, %207 : vector<4xf32>
%212 = vector.extract_strided_slice %55 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%213 = vector.extract %212[0] : vector<1xf32>
%214 = splat %213 : vector<4xf32>
%215 = vector.fma %214, %87, %211 : vector<4xf32>
%216 = vector.extract_strided_slice %56 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%217 = vector.extract %216[0] : vector<1xf32>
%218 = splat %217 : vector<4xf32>
%219 = vector.fma %218, %80, %47#4 : vector<4xf32>
%220 = vector.extract_strided_slice %56 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%221 = vector.extract %220[0] : vector<1xf32>
%222 = splat %221 : vector<4xf32>
%223 = vector.fma %222, %81, %219 : vector<4xf32>
%224 = vector.extract_strided_slice %56 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%225 = vector.extract %224[0] : vector<1xf32>
%226 = splat %225 : vector<4xf32>
%227 = vector.fma %226, %82, %223 : vector<4xf32>
%228 = vector.extract_strided_slice %56 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%229 = vector.extract %228[0] : vector<1xf32>
%230 = splat %229 : vector<4xf32>
%231 = vector.fma %230, %83, %227 : vector<4xf32>
%232 = vector.extract_strided_slice %57 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%233 = vector.extract %232[0] : vector<1xf32>
%234 = splat %233 : vector<4xf32>
%235 = vector.fma %234, %84, %231 : vector<4xf32>
%236 = vector.extract_strided_slice %57 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%237 = vector.extract %236[0] : vector<1xf32>
%238 = splat %237 : vector<4xf32>
%239 = vector.fma %238, %85, %235 : vector<4xf32>
%240 = vector.extract_strided_slice %57 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%241 = vector.extract %240[0] : vector<1xf32>
%242 = splat %241 : vector<4xf32>
%243 = vector.fma %242, %86, %239 : vector<4xf32>
%244 = vector.extract_strided_slice %57 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%245 = vector.extract %244[0] : vector<1xf32>
%246 = splat %245 : vector<4xf32>
%247 = vector.fma %246, %87, %243 : vector<4xf32>
%248 = vector.extract_strided_slice %58 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%249 = vector.extract %248[0] : vector<1xf32>
%250 = splat %249 : vector<4xf32>
%251 = vector.fma %250, %80, %47#5 : vector<4xf32>
%252 = vector.extract_strided_slice %58 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%253 = vector.extract %252[0] : vector<1xf32>
%254 = splat %253 : vector<4xf32>
%255 = vector.fma %254, %81, %251 : vector<4xf32>
%256 = vector.extract_strided_slice %58 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%257 = vector.extract %256[0] : vector<1xf32>
%258 = splat %257 : vector<4xf32>
%259 = vector.fma %258, %82, %255 : vector<4xf32>
%260 = vector.extract_strided_slice %58 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%261 = vector.extract %260[0] : vector<1xf32>
%262 = splat %261 : vector<4xf32>
%263 = vector.fma %262, %83, %259 : vector<4xf32>
%264 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%265 = vector.extract %264[0] : vector<1xf32>
%266 = splat %265 : vector<4xf32>
%267 = vector.fma %266, %84, %263 : vector<4xf32>
%268 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%269 = vector.extract %268[0] : vector<1xf32>
%270 = splat %269 : vector<4xf32>
%271 = vector.fma %270, %85, %267 : vector<4xf32>
%272 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%273 = vector.extract %272[0] : vector<1xf32>
%274 = splat %273 : vector<4xf32>
%275 = vector.fma %274, %86, %271 : vector<4xf32>
%276 = vector.extract_strided_slice %59 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%277 = vector.extract %276[0] : vector<1xf32>
%278 = splat %277 : vector<4xf32>
%279 = vector.fma %278, %87, %275 : vector<4xf32>
%280 = vector.extract_strided_slice %60 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%281 = vector.extract %280[0] : vector<1xf32>
%282 = splat %281 : vector<4xf32>
%283 = vector.fma %282, %80, %47#6 : vector<4xf32>
%284 = vector.extract_strided_slice %60 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%285 = vector.extract %284[0] : vector<1xf32>
%286 = splat %285 : vector<4xf32>
%287 = vector.fma %286, %81, %283 : vector<4xf32>
%288 = vector.extract_strided_slice %60 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%289 = vector.extract %288[0] : vector<1xf32>
%290 = splat %289 : vector<4xf32>
%291 = vector.fma %290, %82, %287 : vector<4xf32>
%292 = vector.extract_strided_slice %60 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%293 = vector.extract %292[0] : vector<1xf32>
%294 = splat %293 : vector<4xf32>
%295 = vector.fma %294, %83, %291 : vector<4xf32>
%296 = vector.extract_strided_slice %61 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%297 = vector.extract %296[0] : vector<1xf32>
%298 = splat %297 : vector<4xf32>
%299 = vector.fma %298, %84, %295 : vector<4xf32>
%300 = vector.extract_strided_slice %61 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%301 = vector.extract %300[0] : vector<1xf32>
%302 = splat %301 : vector<4xf32>
%303 = vector.fma %302, %85, %299 : vector<4xf32>
%304 = vector.extract_strided_slice %61 {offsets = [2], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%305 = vector.extract %304[0] : vector<1xf32>
%306 = splat %305 : vector<4xf32>
%307 = vector.fma %306, %86, %303 : vector<4xf32>
%308 = vector.extract_strided_slice %61 {offsets = [3], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%309 = vector.extract %308[0] : vector<1xf32>
%310 = splat %309 : vector<4xf32>
%311 = vector.fma %310, %87, %307 : vector<4xf32>
%312 = vector.extract_strided_slice %62 {offsets = [0], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>
%313 = vector.extract %312[0] : vector<1xf32>
%314 = splat %313 : vector<4xf32>
%315 = vector.fma %314, %80, %47#7 : vector<4xf32>
%316 = vector.extract_strided_slice %62 {offsets = [1], sizes = [1], strides = [1]} : vector<4xf32> to vector<1xf32>