Skip to content

Instantly share code, notes, and snippets.

@IanWood1
Created June 27, 2024 06:41
Show Gist options
  • Save IanWood1/06a70ff2c6e4ac9e6ea6b806836cddc8 to your computer and use it in GitHub Desktop.
Save IanWood1/06a70ff2c6e4ac9e6ea6b806836cddc8 to your computer and use it in GitHub Desktop.
// -----// IR Dump After SinkReshapesPass (iree-flow-sink-reshapes) //----- //
util.func public @forward(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @forward(%input0: tensor<1x4xi32>, %input1: tensor<1x4xi32>) -> (%output0: tensor<1x4x50257xf32>, %output1: tensor<12x2x1x12x4x64xf32>)"}} {
%cst = arith.constant dense_resource<__elided__> : tensor<4x4xf32>
%cst_0 = arith.constant dense_resource<__elided__> : tensor<4x4xf32>
%cst_1 = arith.constant dense_resource<__elided__> : tensor<1x4x768xf32>
%cst_2 = arith.constant 4.471500e-02 : f32
%cst_3 = arith.constant 0.797884583 : f32
%cst_4 = arith.constant 5.000000e-01 : f32
%cst_5 = arith.constant 3.000000e+00 : f32
%cst_6 = arith.constant 9.99999971E-10 : f32
%cst_7 = arith.constant 1.250000e-01 : f32
%cst_8 = arith.constant 9.99999974E-6 : f32
%cst_9 = arith.constant 7.680000e+02 : f32
%cst_10 = arith.constant -1.000000e+04 : f32
%cst_11 = arith.constant 1.000000e+00 : f32
%cst_12 = arith.constant dense_resource<__elided__> : tensor<50257x768xf32>
%cst_13 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_14 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_15 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_16 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_17 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_18 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_19 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_20 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_21 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_22 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_23 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_24 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_25 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_26 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_27 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_28 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_29 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_30 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_31 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_32 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_33 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_34 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_35 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_36 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_37 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_38 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_39 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_40 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_41 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_42 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_43 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_44 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_45 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_46 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_47 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_48 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_49 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_50 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_51 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_52 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_53 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_54 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_55 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_56 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_57 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_58 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_59 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_60 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_61 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_62 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_63 = arith.constant -0.000000e+00 : f32
%cst_64 = arith.constant 0.000000e+00 : f32
%cst_65 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_66 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_67 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_68 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_69 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_70 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_71 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_72 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_73 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_74 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_75 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_76 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_77 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_78 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_79 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_80 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_81 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_82 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_83 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_84 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_85 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_86 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_87 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_88 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_89 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_90 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_91 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_92 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_93 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_94 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_95 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_96 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_97 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_98 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_99 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_100 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_101 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_102 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_103 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_104 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_105 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_106 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_107 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_108 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_109 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
%cst_110 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_111 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
%cst_112 = arith.constant dense_resource<__elided__> : tensor<768xf32>
%cst_113 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_114 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_115 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_116 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_117 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_118 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_119 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_120 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_121 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_122 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_123 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_124 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_125 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_126 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_127 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_128 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_129 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_130 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_131 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_132 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_133 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_134 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_135 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_136 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_137 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_138 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_139 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_140 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_141 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_142 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_143 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_144 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_145 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_146 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_147 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_148 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_149 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_150 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_151 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_152 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_153 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_154 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_155 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_156 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_157 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
%cst_158 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
%cst_159 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
%cst_160 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
%cst_161 = arith.constant dense_resource<__elided__> : tensor<3142x768x16x1xf32>
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<1x4xi32>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<1x4xi32>
%2 = tensor.empty() : tensor<4xf32>
%3 = tensor.empty() : tensor<1x4xf32>
// Elementwise over %1 (the imported "input1", tensor<1x4xi32>):
//   out = (1.0 - float(in)) * -1.0e4
// An input of 1 maps to (signed) zero and an input of 0 maps to -1.0e4.
// NOTE(review): presumably this builds an additive attention mask from a 0/1 mask input — confirm against the model source.
%4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<1x4xi32>) outs(%3 : tensor<1x4xf32>) {
^bb0(%in: i32, %out: f32):
// Signed int -> float conversion of the input element.
%380 = arith.sitofp %in : i32 to f32
// %cst_11 is 1.0.
%381 = arith.subf %cst_11, %380 : f32
// %cst_10 is -1.0e4.
%382 = arith.mulf %381, %cst_10 : f32
linalg.yield %382 : f32
} -> tensor<1x4xf32>
%5 = tensor.empty() : tensor<12x4x4xf32>
%6 = tensor.empty() : tensor<4x768xf32>
%7 = tensor.empty() : tensor<1x4x768xf32>
// Gather + add producing a 1x4x768 tensor:
//   out[d0, d1, d2] = %cst_12[ index(%0[d0, d1]), d2 ] + %cst_1[d0, d1, d2]
// %0 is the imported "input0" (tensor<1x4xi32>); %cst_12 is the 50257x768 table.
// NOTE(review): presumably a token-embedding lookup plus a positional term — the constants are elided, so confirm.
// NOTE(review): no range check on the gathered row index is visible in this op.
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0, %cst_1 : tensor<1x4xi32>, tensor<1x4x768xf32>) outs(%7 : tensor<1x4x768xf32>) {
^bb0(%in: i32, %in_486: f32, %out: f32):
// Current position along d2 (the 768-wide dimension); used as the table column.
%380 = linalg.index 2 : index
// The i32 input value is used directly as the table row.
%381 = arith.index_cast %in : i32 to index
%extracted = tensor.extract %cst_12[%381, %380] : tensor<50257x768xf32>
// Add the matching element of %cst_1.
%382 = arith.addf %extracted, %in_486 : f32
linalg.yield %382 : f32
} -> tensor<1x4x768xf32>
%collapsed = tensor.collapse_shape %8 [[0, 1], [2]] : tensor<1x4x768xf32> into tensor<4x768xf32>
// Row-wise normalisation of the 4x768 view of %8. The steps match the
// layer-normalisation pattern over the 768-wide dimension:
//   %10 = per-row sum, %11 = per-row mean, %12 = per-row sum of squared deviations,
//   %14 = (x - mean) * rsqrt(var + %cst_8) * %cst_62 + %cst_61 (in algebraically rearranged form).
// %9 is the reduction initial value: a 4-element tensor filled with %cst_63 (-0.0).
%9 = linalg.fill ins(%cst_63 : f32) outs(%2 : tensor<4xf32>) -> tensor<4xf32>
// Sum each row of %collapsed along d1 (the reduction dimension).
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Divide each row sum by %cst_9 (768.0, the row length) to get the row mean.
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Accumulate (x - mean)^2 along d1 for each row; the division by 768 happens in %14.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed, %11 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
// Same source (%8) and same reassociation as the earlier %collapsed value.
%collapsed_162 = tensor.collapse_shape %8 [[0, 1], [2]] : tensor<1x4x768xf32> into tensor<4x768xf32>
%13 = tensor.empty() : tensor<4x768xf32>
// Block arguments: %in = x[d0, d1], %in_486 = squared-deviation sum[d0] (%12),
// %in_487 = %cst_62[d1], %in_488 = %cst_61[d1], %in_489 = mean[d0] (%11).
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_162, %12, %cst_62, %cst_61, %11 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%13 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
// var = squared-deviation sum / 768.0
%380 = arith.divf %in_486, %cst_9 : f32
// %cst_8 is ~1e-5, added before the reciprocal square root.
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
// scale = rsqrt(var + eps) * %cst_62[d1]
%383 = arith.mulf %382, %in_487 : f32
// shift = %cst_61[d1] - mean * scale
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
// out = x * scale + shift
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%15 = tensor.empty() : tensor<4x2304xf32>
%16 = tensor.empty() : tensor<1x768x4x1xf32>
%pack = tensor.pack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%17 = tensor.empty() : tensor<1x144x4x16xf32>
%18 = linalg.fill ins(%cst_64 : f32) outs(%17 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%19 = linalg.mmt4d ins(%pack, %cst_113 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%20 = tensor.empty() : tensor<4x2304xf32>
// Adds the length-2304 vector %cst_65 to every row of %unpack (broadcast along d0):
//   out[d0, d1] = %unpack[d0, d1] + %cst_65[d1]
// NOTE(review): presumably the bias of the preceding mmt4d projection — confirm.
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack, %cst_65 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%20 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded = tensor.expand_shape %21 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice = tensor.extract_slice %expanded[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_163 = tensor.extract_slice %expanded[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_164 = tensor.extract_slice %expanded[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_165 = tensor.expand_shape %extracted_slice [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%22 = tensor.empty() : tensor<12x4x64xf32>
%expanded_166 = tensor.expand_shape %extracted_slice_163 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Pure copy that swaps the first two dimensions of %expanded_166:
//   out[d0, d1, d2] = in[d1, d0, d2]   (4x12x64 -> 12x4x64)
// No arithmetic is performed on the elements.
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_166 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%24 = tensor.empty() : tensor<12x1x64x4x1xf32>
%pack_167 = tensor.pack %expanded_165 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%25 = tensor.empty() : tensor<12x1x64x16x1xf32>
%pack_168 = tensor.pack %expanded_166 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%26 = tensor.empty() : tensor<12x1x1x4x16xf32>
%27 = linalg.fill ins(%cst_64 : f32) outs(%26 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%28 = linalg.batch_mmt4d ins(%pack_167, %pack_168 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_169 = tensor.unpack %28 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
%collapsed_170 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%29 = tensor.empty() : tensor<12x4x4xf32>
// Elementwise adjustment of the 12x4x4 batch_mmt4d result before the softmax:
//   out[d0, d1, d2] = in * 0.125 * %cst_0[d1, d2] - %cst[d1, d2] + %collapsed_170[d2] + %cst_6
// %cst_0 and %cst are 4x4 constants broadcast along d0. %collapsed_170 is the
// 4-element view of %4, broadcast along d0 and d1.
// NOTE(review): 0.125 equals 1/sqrt(64), and 64 is the trailing size of the operands
// packed for the matmul; presumably this is attention-score scaling plus causal/padding
// masking. The 4x4 constants are elided, so confirm.
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_169, %cst_0, %cst, %collapsed_170 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%29 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
// %cst_7 is 0.125.
%380 = arith.mulf %in, %cst_7 : f32
// Multiply by %cst_0[d1, d2], then subtract %cst[d1, d2].
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
// Add the per-column value derived from "input1".
%383 = arith.addf %382, %in_488 : f32
// %cst_6 is ~1e-9.
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
%31 = linalg.softmax dimension(2) ins(%30 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
%expanded_171 = tensor.expand_shape %extracted_slice_164 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Pure copy that swaps the first two dimensions of %expanded_171:
//   out[d0, d1, d2] = in[d1, d0, d2]   (4x12x64 -> 12x4x64)
// No arithmetic is performed on the elements.
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_171 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%33 = tensor.empty() : tensor<12x1x4x4x1xf32>
%pack_172 = tensor.pack %31 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%34 = tensor.empty() : tensor<12x4x4x16x1xf32>
%pack_173 = tensor.pack %expanded_171 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
%35 = tensor.empty() : tensor<12x1x4x4x16xf32>
%36 = linalg.fill ins(%cst_64 : f32) outs(%35 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%37 = linalg.batch_mmt4d ins(%pack_172, %pack_173 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%38 = tensor.empty() : tensor<4x12x64xf32>
%unpack_174 = tensor.unpack %37 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
%39 = tensor.empty() : tensor<1x12x64x4x1xf32>
%pack_175 = tensor.pack %unpack_174 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_176 = tensor.collapse_shape %pack_175 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
%40 = tensor.empty() : tensor<1x48x4x16xf32>
%41 = linalg.fill ins(%cst_64 : f32) outs(%40 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%42 = linalg.mmt4d ins(%collapsed_176, %cst_114 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_177 = tensor.unpack %42 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%collapsed_178 = tensor.collapse_shape %8 [[0, 1], [2]] : tensor<1x4x768xf32> into tensor<4x768xf32>
%43 = tensor.empty() : tensor<4x768xf32>
// Three-way elementwise add:
//   out[d0, d1] = %collapsed_178[d0, d1] + (%unpack_177[d0, d1] + %cst_66[d1])
// %collapsed_178 is the 4x768 view of %8; %cst_66 is broadcast along d0.
// NOTE(review): presumably projection bias plus residual connection — confirm.
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_178, %unpack_177, %cst_66 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%43 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
// matmul result + per-column vector
%380 = arith.addf %in_486, %in_487 : f32
// + the earlier activation
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalisation of %44 (4x768), following the layer-normalisation pattern:
//   %45 = per-row sum, %46 = per-row mean, %47 = per-row sum of squared deviations,
//   %49 = (x - mean) * rsqrt(var + %cst_8) * %cst_60 + %cst_59 (in algebraically rearranged form).
// Both reductions start from %9, the 4-element tensor filled with %cst_63 (-0.0).
// Sum each row of %44 along d1.
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%44 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Divide each row sum by %cst_9 (768.0, the row length) to get the row mean.
%46 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Accumulate (x - mean)^2 along d1 for each row.
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%44, %46 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%48 = tensor.empty() : tensor<4x768xf32>
// Block arguments: %in = x[d0, d1], %in_486 = squared-deviation sum[d0] (%47),
// %in_487 = %cst_60[d1], %in_488 = %cst_59[d1], %in_489 = mean[d0] (%46).
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %47, %cst_60, %cst_59, %46 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%48 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
// var = squared-deviation sum / 768.0
%380 = arith.divf %in_486, %cst_9 : f32
// %cst_8 is ~1e-5, added before the reciprocal square root.
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
// scale = rsqrt(var + eps) * %cst_60[d1]
%383 = arith.mulf %382, %in_487 : f32
// shift = %cst_59[d1] - mean * scale
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
// out = x * scale + shift
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%50 = tensor.empty() : tensor<4x3072xf32>
%pack_179 = tensor.pack %49 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%51 = tensor.empty() : tensor<1x192x4x16xf32>
%52 = linalg.fill ins(%cst_64 : f32) outs(%51 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%53 = linalg.mmt4d ins(%pack_179, %cst_115 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_180 = tensor.unpack %53 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
// Adds the length-3072 vector %cst_67 (broadcast along d0) to %unpack_180, then applies
//   out = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
// which has the form of the tanh approximation of GELU (0.797884583 ~= sqrt(2/pi)).
// The op writes into %50, the same tensor.empty value used as the unpack destination.
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_180, %cst_67 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
// x = matmul result + per-column vector
%380 = arith.addf %in, %in_486 : f32
// x^3 via powf with %cst_5 (3.0)
%381 = math.powf %380, %cst_5 : f32
// 0.044715 * x^3 (%cst_2)
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
// * 0.797884583 (%cst_3)
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
// 1 + tanh(...) (%cst_11 is 1.0)
%386 = arith.addf %385, %cst_11 : f32
// 0.5 * x (%cst_4 is 0.5)
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
%55 = tensor.empty() : tensor<1x3072x4x1xf32>
%pack_181 = tensor.pack %54 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%56 = linalg.mmt4d ins(%pack_181, %cst_116 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_182 = tensor.unpack %56 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%57 = tensor.empty() : tensor<4x768xf32>
// Three-way elementwise add:
//   out[d0, d1] = %44[d0, d1] + (%unpack_182[d0, d1] + %cst_68[d1])
// %cst_68 is broadcast along d0.
// NOTE(review): presumably projection bias plus residual connection — confirm.
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %unpack_182, %cst_68 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%57 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
// matmul result + per-column vector
%380 = arith.addf %in_486, %in_487 : f32
// + the earlier activation (%44)
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalisation of %58 (4x768), following the layer-normalisation pattern:
//   %59 = per-row sum, %60 = per-row mean, %61 = per-row sum of squared deviations,
//   %63 = (x - mean) * rsqrt(var + %cst_8) * %cst_50 + %cst_49 (in algebraically rearranged form).
// Both reductions start from %9, the 4-element tensor filled with %cst_63 (-0.0).
// Sum each row of %58 along d1.
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%58 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Divide each row sum by %cst_9 (768.0, the row length) to get the row mean.
%60 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%59 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Accumulate (x - mean)^2 along d1 for each row.
%61 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%58, %60 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%62 = tensor.empty() : tensor<4x768xf32>
// Block arguments: %in = x[d0, d1], %in_486 = squared-deviation sum[d0] (%61),
// %in_487 = %cst_50[d1], %in_488 = %cst_49[d1], %in_489 = mean[d0] (%60).
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %61, %cst_50, %cst_49, %60 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%62 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
// var = squared-deviation sum / 768.0
%380 = arith.divf %in_486, %cst_9 : f32
// %cst_8 is ~1e-5, added before the reciprocal square root.
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
// scale = rsqrt(var + eps) * %cst_50[d1]
%383 = arith.mulf %382, %in_487 : f32
// shift = %cst_49[d1] - mean * scale
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
// out = x * scale + shift
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_183 = tensor.pack %63 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%64 = linalg.mmt4d ins(%pack_183, %cst_117 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_184 = tensor.unpack %64 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%65 = tensor.empty() : tensor<4x2304xf32>
// Adds the length-2304 vector %cst_69 to every row of %unpack_184 (broadcast along d0):
//   out[d0, d1] = %unpack_184[d0, d1] + %cst_69[d1]
// NOTE(review): presumably the bias of the preceding mmt4d projection — confirm.
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_184, %cst_69 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%65 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded_185 = tensor.expand_shape %66 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_186 = tensor.extract_slice %expanded_185[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_187 = tensor.extract_slice %expanded_185[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_188 = tensor.extract_slice %expanded_185[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_189 = tensor.expand_shape %extracted_slice_186 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_190 = tensor.expand_shape %extracted_slice_187 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Pure copy that swaps the first two dimensions of %expanded_190:
//   out[d0, d1, d2] = in[d1, d0, d2]   (4x12x64 -> 12x4x64)
// No arithmetic is performed on the elements.
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_190 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_191 = tensor.pack %expanded_189 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_192 = tensor.pack %expanded_190 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%68 = linalg.batch_mmt4d ins(%pack_191, %pack_192 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_193 = tensor.unpack %68 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
// Flatten %4 (1x4) to a length-4 vector so it can be broadcast along d2 below.
%collapsed_194 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%69 = tensor.empty() : tensor<12x4x4xf32>
// Element-wise over the 12x4x4 matmul result:
//   out[d0, d1, d2] = in * %cst_7 * %cst_0[d1, d2] - %cst[d1, d2]
//                     + %collapsed_194[d2] + %cst_6
// with %cst_7 = 0.125 and %cst_6 ~= 1e-9 (see the constants at the top of the
// function). %cst_0 and %cst are 4x4 constants shared by all 12 batches.
// NOTE(review): 0.125 equals 1/sqrt(64), so this is presumably attention-score
// scaling followed by causal / padding masking -- confirm, the 4x4 constants
// are elided in this dump.
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_193, %cst_0, %cst, %collapsed_194 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%69 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
// Scale the raw value by %cst_7.
%380 = arith.mulf %in, %cst_7 : f32
// Multiply by the %cst_0 entry, then subtract the %cst entry.
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
// Add the per-column term taken from %collapsed_194.
%383 = arith.addf %382, %in_488 : f32
// Add the small constant %cst_6.
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
// Softmax along the last dimension (dimension 2) of the 12x4x4 tensor.
%71 = linalg.softmax dimension(2) ins(%70 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
// View the third 4x768 slice as 4x12x64 (768 split into 12 x 64).
%expanded_195 = tensor.expand_shape %extracted_slice_188 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Copy it into a 12x4x64 tensor with the first two dimensions swapped:
// result[d0, d1, d2] = input[d1, d0, d2]. %22 is only the destination.
// NOTE(review): no use of %72 is visible in this part of the dump -- it is
// presumably consumed later in the function.
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_195 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Tile the softmax output (12x4x4, tiles 4x1) and %expanded_195 (4x12x64,
// tiles 16x1 with the size-12 axis moved to the front) for batch_mmt4d.
%pack_196 = tensor.pack %71 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_197 = tensor.pack %expanded_195 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
// Batched tiled matmul accumulating into %36 (defined outside this view).
// NOTE(review): presumably softmax(scores) x value per head -- confirm.
%73 = linalg.batch_mmt4d ins(%pack_196, %pack_197 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
// Unpack to 4x12x64; outer_dims_perm = [1, 0, 2] moves the batch axis back
// to position 1.
%unpack_198 = tensor.unpack %73 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
// Re-tile with 4x1 tiles and merge the 12 and 64 axes into one 768 axis so the
// value can be used as the left operand of a plain (non-batched) mmt4d.
%pack_199 = tensor.pack %unpack_198 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_200 = tensor.collapse_shape %pack_199 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
// Tiled matmul against the pre-packed constant %cst_118, then unpack to 4x768
// (48 tiles x 16 = 768 columns).
// NOTE(review): presumably the attention output projection -- confirm.
%74 = linalg.mmt4d ins(%collapsed_200, %cst_118 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_201 = tensor.unpack %74 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%75 = tensor.empty() : tensor<4x768xf32>
// Element-wise: out[d0, d1] = %58[d0, d1] + (%unpack_201[d0, d1] + %cst_70[d1]).
// %cst_70 is a length-768 vector broadcast across the 4 rows.
// NOTE(review): presumably projection bias plus residual connection -- confirm.
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %unpack_201, %cst_70 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%75 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise sum of %76: reduces the 768-wide d1 axis, producing one value per
// row. The accumulator starts from %9, which is defined outside this view
// (presumably zero-filled -- confirm).
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%76 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Divide each row sum by %cst_9 (768.0, the length of the reduced axis), i.e.
// the per-row mean of %76. %2 is only the destination operand.
%78 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Row-wise sum of squared deviations: for each row d0, accumulate
// (%76[d0, d1] - %78[d0])^2 over d1. The accumulator starts from %9, which is
// defined outside this view (presumably zero-filled -- confirm).
%79 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%76, %78 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%80 = tensor.empty() : tensor<4x768xf32>
// Row normalization of %76 using the statistics computed above:
//   s   = rsqrt(%79[d0] / 768 + %cst_8) * %cst_48[d1]
//   out = %76[d0, d1] * s + (%cst_47[d1] - %78[d0] * s)
// which is algebraically (x - mean) * rsqrt(var + eps) * %cst_48 + %cst_47,
// with eps = %cst_8 (~1e-5).
// NOTE(review): %cst_48 / %cst_47 are presumably the layer-norm scale and
// bias; their definitions are outside this view.
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %79, %cst_48, %cst_47, %78 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%80 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
// Sum of squared deviations / 768, plus the small constant %cst_8.
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
// Reciprocal square root, multiplied by the per-column %cst_48 value.
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
// Fold the mean into the additive term: %cst_47[d1] - mean * s.
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
// x * s + (additive term).
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// Tile %81 (4x768) with 4x1 tiles, multiply it against the pre-packed constant
// %cst_119 with linalg.mmt4d (accumulating into %52), and unpack the result to
// 4x3072 (192 tiles x 16 = 3072 columns).
// NOTE(review): presumably the first (expanding) feed-forward matmul -- confirm.
%pack_202 = tensor.pack %81 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%82 = linalg.mmt4d ins(%pack_202, %cst_119 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_203 = tensor.unpack %82 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
// Bias add followed by a tanh-based activation, element-wise over 4x3072:
//   x   = %unpack_203[d0, d1] + %cst_71[d1]
//   out = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
// (constants %cst_4, %cst_11, %cst_3, %cst_2 and exponent %cst_5 = 3.0 from the
// top of the function). This is the tanh approximation of GELU.
// %50 is only the destination operand; the body never reads %out.
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_203, %cst_71 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
// x = matmul result + per-column bias.
%380 = arith.addf %in, %in_486 : f32
// x + 0.044715 * x^3
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
// 1 + tanh(0.797884583 * (...))
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
// 0.5 * x * (1 + tanh(...))
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
// Tile %83 (4x3072) with 4x1 tiles, multiply it against the pre-packed constant
// %cst_120 with linalg.mmt4d (accumulating into %41), and unpack the result to
// 4x768 (48 tiles x 16 = 768 columns).
// NOTE(review): presumably the second (contracting) feed-forward matmul -- confirm.
%pack_204 = tensor.pack %83 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%84 = linalg.mmt4d ins(%pack_204, %cst_120 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_205 = tensor.unpack %84 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%85 = tensor.empty() : tensor<4x768xf32>
// Element-wise: out[d0, d1] = %76[d0, d1] + (%unpack_205[d0, d1] + %cst_72[d1]).
// %cst_72 is a length-768 vector broadcast across the 4 rows.
// NOTE(review): presumably feed-forward bias plus residual connection -- confirm.
%86 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %unpack_205, %cst_72 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%85 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise sum of %86: reduces the 768-wide d1 axis, producing one value per
// row. The accumulator starts from %9, which is defined outside this view
// (presumably zero-filled -- confirm).
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%86 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Divide each row sum by %cst_9 (768.0, the length of the reduced axis), i.e.
// the per-row mean of %86. %2 is only the destination operand.
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%87 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Row-wise sum of squared deviations: for each row d0, accumulate
// (%86[d0, d1] - %88[d0])^2 over d1. The accumulator starts from %9, which is
// defined outside this view (presumably zero-filled -- confirm).
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%86, %88 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%90 = tensor.empty() : tensor<4x768xf32>
// Row normalization of %86 using its row mean (%88) and row sum of squared
// deviations (%89):
//   s   = rsqrt(%89[d0] / 768 + %cst_8) * %cst_46[d1]
//   out = %86[d0, d1] * s + (%cst_45[d1] - %88[d0] * s)
// which is algebraically (x - mean) * rsqrt(var + eps) * %cst_46 + %cst_45,
// with eps = %cst_8 (~1e-5).
// NOTE(review): %cst_46 / %cst_45 are presumably the layer-norm scale and
// bias; their definitions are outside this view.
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %89, %cst_46, %cst_45, %88 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%90 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
// Sum of squared deviations / 768, plus the small constant %cst_8.
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
// Reciprocal square root, multiplied by the per-column %cst_46 value.
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
// Fold the mean into the additive term: %cst_45[d1] - mean * s.
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
// x * s + (additive term).
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// Tile %91 (4x768) with 4x1 tiles, multiply it against the pre-packed constant
// %cst_121 with linalg.mmt4d (accumulating into %18), and unpack the result to
// 4x2304 (144 tiles x 16 = 2304 columns).
// NOTE(review): presumably the fused query/key/value projection of the next
// layer -- confirm.
%pack_206 = tensor.pack %91 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%92 = linalg.mmt4d ins(%pack_206, %cst_121 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_207 = tensor.unpack %92 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%93 = tensor.empty() : tensor<4x2304xf32>
// Element-wise: out[d0, d1] = %unpack_207[d0, d1] + %cst_73[d1]; the
// length-2304 vector %cst_73 is broadcast across the 4 rows.
%94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_207, %cst_73 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%93 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded_208 = tensor.expand_shape %94 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_209 = tensor.extract_slice %expanded_208[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_210 = tensor.extract_slice %expanded_208[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_211 = tensor.extract_slice %expanded_208[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_212 = tensor.expand_shape %extracted_slice_209 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_213 = tensor.expand_shape %extracted_slice_210 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_213 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_214 = tensor.pack %expanded_212 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_215 = tensor.pack %expanded_213 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%96 = linalg.batch_mmt4d ins(%pack_214, %pack_215 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_216 = tensor.unpack %96 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
%collapsed_217 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%97 = tensor.empty() : tensor<12x4x4xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_216, %cst_0, %cst, %collapsed_217 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%97 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
%99 = linalg.softmax dimension(2) ins(%98 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
%expanded_218 = tensor.expand_shape %extracted_slice_211 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_218 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_219 = tensor.pack %99 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_220 = tensor.pack %expanded_218 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
%101 = linalg.batch_mmt4d ins(%pack_219, %pack_220 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%unpack_221 = tensor.unpack %101 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
%pack_222 = tensor.pack %unpack_221 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_223 = tensor.collapse_shape %pack_222 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
%102 = linalg.mmt4d ins(%collapsed_223, %cst_122 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_224 = tensor.unpack %102 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%103 = tensor.empty() : tensor<4x768xf32>
%104 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %unpack_224, %cst_74 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%103 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%104 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%106 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%104, %106 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%108 = tensor.empty() : tensor<4x768xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%104, %107, %cst_44, %cst_43, %106 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%108 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_225 = tensor.pack %109 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%110 = linalg.mmt4d ins(%pack_225, %cst_123 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_226 = tensor.unpack %110 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_226, %cst_75 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
%pack_227 = tensor.pack %111 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%112 = linalg.mmt4d ins(%pack_227, %cst_124 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_228 = tensor.unpack %112 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%113 = tensor.empty() : tensor<4x768xf32>
%114 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%104, %unpack_228, %cst_76 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%113 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%115 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%114 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%116 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%115 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%117 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%114, %116 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%118 = tensor.empty() : tensor<4x768xf32>
%119 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %117, %cst_42, %cst_41, %116 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%118 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_229 = tensor.pack %119 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%120 = linalg.mmt4d ins(%pack_229, %cst_125 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_230 = tensor.unpack %120 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%121 = tensor.empty() : tensor<4x2304xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_230, %cst_77 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%121 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded_231 = tensor.expand_shape %122 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_232 = tensor.extract_slice %expanded_231[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_233 = tensor.extract_slice %expanded_231[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_234 = tensor.extract_slice %expanded_231[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_235 = tensor.expand_shape %extracted_slice_232 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_236 = tensor.expand_shape %extracted_slice_233 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_236 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_237 = tensor.pack %expanded_235 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_238 = tensor.pack %expanded_236 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%124 = linalg.batch_mmt4d ins(%pack_237, %pack_238 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_239 = tensor.unpack %124 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
%collapsed_240 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%125 = tensor.empty() : tensor<12x4x4xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_239, %cst_0, %cst, %collapsed_240 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%125 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
%127 = linalg.softmax dimension(2) ins(%126 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
%expanded_241 = tensor.expand_shape %extracted_slice_234 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_241 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_242 = tensor.pack %127 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_243 = tensor.pack %expanded_241 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
%129 = linalg.batch_mmt4d ins(%pack_242, %pack_243 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%unpack_244 = tensor.unpack %129 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
%pack_245 = tensor.pack %unpack_244 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_246 = tensor.collapse_shape %pack_245 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
%130 = linalg.mmt4d ins(%collapsed_246, %cst_126 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_247 = tensor.unpack %130 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%131 = tensor.empty() : tensor<4x768xf32>
%132 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %unpack_247, %cst_78 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%131 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%133 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%132 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%134 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%133 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%135 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%132, %134 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%136 = tensor.empty() : tensor<4x768xf32>
%137 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%132, %135, %cst_40, %cst_39, %134 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%136 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_248 = tensor.pack %137 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%138 = linalg.mmt4d ins(%pack_248, %cst_127 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_249 = tensor.unpack %138 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
%139 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_249, %cst_79 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
%pack_250 = tensor.pack %139 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%140 = linalg.mmt4d ins(%pack_250, %cst_128 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_251 = tensor.unpack %140 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%141 = tensor.empty() : tensor<4x768xf32>
%142 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%132, %unpack_251, %cst_80 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%141 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Per-row statistics of %142 (4x768); d1 is the reduction dimension.
// %143: sum of each row, accumulated on top of the initial value %9.
// NOTE(review): %9 is defined outside this excerpt; presumably zero-filled -- confirm.
%143 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%142 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// %144: %143 divided by %cst_9 (7.68e+02, matching the extent of d1), i.e. the
// row mean provided %9 starts at zero.
%144 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%143 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// %145: sum over d1 of (%142 - %144)^2, again accumulated on top of %9. The
// division by %cst_9 happens in the consumer, not here.
%145 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%142, %144 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
// Normalize each row of %142 using %144 and %145, with a per-column scale
// (%cst_38) and shift (%cst_37):
//   s   = rsqrt(%145 / %cst_9 + %cst_8) * %cst_38
//   out = %142 * s + (%cst_37 - %144 * s)
// which is algebraically (%142 - %144) * s + %cst_37. In the function's constant
// list %cst_9 is 7.68e+02 and %cst_8 is 9.99999974E-6.
// NOTE(review): this has the shape of a layer normalization with epsilon %cst_8;
// the contents of %cst_38 / %cst_37 are elided, so confirm against the model.
%146 = tensor.empty() : tensor<4x768xf32>
%147 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%142, %145, %cst_38, %cst_37, %144 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%146 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// Tile %147 (4x768) into 4x1 inner tiles, multiply it against the pre-packed
// constant %cst_129 with linalg.mmt4d, and untile the product to 4x2304.
// NOTE(review): %16, %18 and %15 are defined outside this excerpt; %18 is the
// mmt4d accumulator and is presumably zero-filled -- confirm.
%pack_252 = tensor.pack %147 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%148 = linalg.mmt4d ins(%pack_252, %cst_129 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_253 = tensor.unpack %148 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
// Add the 2304-element %cst_81 to every row; the (d0, d1) -> (d1) map broadcasts
// it across d0.
%149 = tensor.empty() : tensor<4x2304xf32>
%150 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_253, %cst_81 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%149 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
// Split the 2304 columns of %150 into three consecutive 768-wide slices at
// offsets 0, 768 and 1536. The leading unit dimension added by the expand_shape
// is dropped again by the rank-reducing extract_slice ops.
// NOTE(review): the three slices are presumably the attention query, key and
// value projections, in that order -- confirm against the model.
%expanded_254 = tensor.expand_shape %150 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_255 = tensor.extract_slice %expanded_254[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_256 = tensor.extract_slice %expanded_254[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_257 = tensor.extract_slice %expanded_254[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
// View the first two slices as 4x12x64 by splitting the 768 columns into 12 x 64.
%expanded_258 = tensor.expand_shape %extracted_slice_255 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_259 = tensor.expand_shape %extracted_slice_256 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Transposed copy of %expanded_259: the input map (d1, d0, d2) swaps the first two
// dimensions, giving 12x4x64.
// NOTE(review): %151 has no uses within this excerpt; presumably it is consumed
// later in the function -- confirm.
%151 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_259 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Batched product of %expanded_258 with %expanded_259. Both packs move the
// 12-extent dimension outermost (outer_dims_perm = [1, 0, 2]) so it becomes the
// batch dimension of linalg.batch_mmt4d. The right-hand side uses 16-wide inner
// tiles, so its 4-extent dimension is padded with %cst_64. The unpacked result is
// 12x4x4.
// NOTE(review): %24, %25, %27 and %5 are defined outside this excerpt; %27 is the
// accumulator and is presumably zero-filled -- confirm.
%pack_260 = tensor.pack %expanded_258 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_261 = tensor.pack %expanded_259 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%152 = linalg.batch_mmt4d ins(%pack_260, %pack_261 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_262 = tensor.unpack %152 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
// Flatten %4 from 1x4 to a 4-element vector so it can be broadcast along d2 below.
%collapsed_263 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
// Per element: ((x * %cst_7) * %cst_0[d1, d2] - %cst[d1, d2]) + %4[d2] + %cst_6,
// where %cst_7 is 1.25e-01 and %cst_6 is 9.99999971E-10 in the function's
// constant list. The 4x4 operands are shared by all 12 batches.
// NOTE(review): %cst_0 and %cst are elided 4x4 constants; presumably a mask and
// its additive penalty, with %4 an input-derived mask -- confirm.
%153 = tensor.empty() : tensor<12x4x4xf32>
%154 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_262, %cst_0, %cst, %collapsed_263 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%153 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
// Softmax along the last dimension (dimension 2) of the 12x4x4 tensor.
%155 = linalg.softmax dimension(2) ins(%154 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
// View the third 768-wide slice as 4x12x64.
%expanded_264 = tensor.expand_shape %extracted_slice_257 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Transposed copy of %expanded_264 (first two dimensions swapped), giving 12x4x64.
// NOTE(review): %156 has no uses within this excerpt; presumably it is consumed
// later in the function -- confirm.
%156 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_264 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Batched product of the softmax output %155 (12x4x4) with %expanded_264. The
// right-hand pack permutes 4x12x64 so the 12-extent dimension is the batch and the
// 64-extent dimension is tiled by 16. The unpack's outer_dims_perm = [1, 0, 2]
// writes the result back as 4x12x64.
// NOTE(review): %33, %34, %36 and %38 are defined outside this excerpt.
%pack_265 = tensor.pack %155 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_266 = tensor.pack %expanded_264 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
%157 = linalg.batch_mmt4d ins(%pack_265, %pack_266 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%unpack_267 = tensor.unpack %157 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
// Re-tile the 4x12x64 result and merge its 12 and 64 outer dimensions back into
// 768, so it can be the left-hand side of a 2-D mmt4d against the pre-packed
// constant %cst_130. The product is untiled to 4x768.
// NOTE(review): %39, %41 and %6 are defined outside this excerpt.
%pack_268 = tensor.pack %unpack_267 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_269 = tensor.collapse_shape %pack_268 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
%158 = linalg.mmt4d ins(%collapsed_269, %cst_130 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_270 = tensor.unpack %158 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
// Element-wise combine: %160 = %142 + (%unpack_270 + %cst_82). The third indexing
// map, (d0, d1) -> (d1), broadcasts the 768-element %cst_82 across d0.
%159 = tensor.empty() : tensor<4x768xf32>
%160 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%142, %unpack_270, %cst_82 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%159 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Per-row statistics of %160 (4x768); d1 is the reduction dimension.
// %161: sum of each row, accumulated on top of the initial value %9.
// NOTE(review): %9 is defined outside this excerpt; presumably zero-filled -- confirm.
%161 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%160 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// %162: %161 divided by %cst_9 (7.68e+02, matching the extent of d1), i.e. the
// row mean provided %9 starts at zero.
%162 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%161 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// %163: sum over d1 of (%160 - %162)^2, again accumulated on top of %9.
%163 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%160, %162 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
// Normalize each row of %160 using %162 and %163, with a per-column scale
// (%cst_36) and shift (%cst_35):
//   s   = rsqrt(%163 / %cst_9 + %cst_8) * %cst_36
//   out = %160 * s + (%cst_35 - %162 * s)
// which is algebraically (%160 - %162) * s + %cst_35. In the function's constant
// list %cst_9 is 7.68e+02 and %cst_8 is 9.99999974E-6.
%164 = tensor.empty() : tensor<4x768xf32>
%165 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%160, %163, %cst_36, %cst_35, %162 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%164 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// Tile %165 (4x768) into 4x1 inner tiles, multiply it against the pre-packed
// constant %cst_131 with linalg.mmt4d, and untile the product to 4x3072.
// NOTE(review): %16, %52 and %50 are defined outside this excerpt; %52 is the
// mmt4d accumulator and is presumably zero-filled -- confirm.
%pack_271 = tensor.pack %165 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%166 = linalg.mmt4d ins(%pack_271, %cst_131 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_272 = tensor.unpack %166 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
// Add the 3072-element %cst_83 to every row (broadcast across d0), then apply the
// tanh form of the GELU activation to the sum x:
//   0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
// The function's constant list gives %cst_4 = 0.5, %cst_11 = 1.0,
// %cst_3 = 0.797884583, %cst_2 = 4.4715e-02 and %cst_5 = 3.0 (the powf exponent).
%167 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_272, %cst_83 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
%pack_273 = tensor.pack %167 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%168 = linalg.mmt4d ins(%pack_273, %cst_132 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_274 = tensor.unpack %168 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%169 = tensor.empty() : tensor<4x768xf32>
%170 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%160, %unpack_274, %cst_84 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%169 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%171 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%170 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%172 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%171 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%173 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%170, %172 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%174 = tensor.empty() : tensor<4x768xf32>
%175 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%170, %173, %cst_34, %cst_33, %172 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%174 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_275 = tensor.pack %175 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%176 = linalg.mmt4d ins(%pack_275, %cst_133 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_276 = tensor.unpack %176 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%177 = tensor.empty() : tensor<4x2304xf32>
%178 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_276, %cst_85 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%177 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded_277 = tensor.expand_shape %178 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_278 = tensor.extract_slice %expanded_277[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_279 = tensor.extract_slice %expanded_277[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_280 = tensor.extract_slice %expanded_277[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_281 = tensor.expand_shape %extracted_slice_278 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_282 = tensor.expand_shape %extracted_slice_279 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%179 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_282 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_283 = tensor.pack %expanded_281 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_284 = tensor.pack %expanded_282 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%180 = linalg.batch_mmt4d ins(%pack_283, %pack_284 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_285 = tensor.unpack %180 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
%collapsed_286 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%181 = tensor.empty() : tensor<12x4x4xf32>
%182 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_285, %cst_0, %cst, %collapsed_286 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%181 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
%183 = linalg.softmax dimension(2) ins(%182 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
%expanded_287 = tensor.expand_shape %extracted_slice_280 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%184 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_287 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_288 = tensor.pack %183 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_289 = tensor.pack %expanded_287 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
%185 = linalg.batch_mmt4d ins(%pack_288, %pack_289 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%unpack_290 = tensor.unpack %185 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
%pack_291 = tensor.pack %unpack_290 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_292 = tensor.collapse_shape %pack_291 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
%186 = linalg.mmt4d ins(%collapsed_292, %cst_134 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_293 = tensor.unpack %186 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%187 = tensor.empty() : tensor<4x768xf32>
%188 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%170, %unpack_293, %cst_86 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%187 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%189 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%188 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%190 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%191 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%188, %190 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%192 = tensor.empty() : tensor<4x768xf32>
%193 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%188, %191, %cst_32, %cst_31, %190 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%192 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_294 = tensor.pack %193 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%194 = linalg.mmt4d ins(%pack_294, %cst_135 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_295 = tensor.unpack %194 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
%195 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_295, %cst_87 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
%pack_296 = tensor.pack %195 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%196 = linalg.mmt4d ins(%pack_296, %cst_136 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_297 = tensor.unpack %196 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%197 = tensor.empty() : tensor<4x768xf32>
%198 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%188, %unpack_297, %cst_88 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%197 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%199 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%198 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%200 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%199 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%201 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%198, %200 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%202 = tensor.empty() : tensor<4x768xf32>
%203 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%198, %201, %cst_30, %cst_29, %200 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%202 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_298 = tensor.pack %203 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%204 = linalg.mmt4d ins(%pack_298, %cst_137 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_299 = tensor.unpack %204 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%205 = tensor.empty() : tensor<4x2304xf32>
%206 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_299, %cst_89 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%205 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded_300 = tensor.expand_shape %206 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_301 = tensor.extract_slice %expanded_300[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_302 = tensor.extract_slice %expanded_300[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_303 = tensor.extract_slice %expanded_300[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_304 = tensor.expand_shape %extracted_slice_301 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_305 = tensor.expand_shape %extracted_slice_302 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%207 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_305 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_306 = tensor.pack %expanded_304 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_307 = tensor.pack %expanded_305 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%208 = linalg.batch_mmt4d ins(%pack_306, %pack_307 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_308 = tensor.unpack %208 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
%collapsed_309 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%209 = tensor.empty() : tensor<12x4x4xf32>
%210 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_308, %cst_0, %cst, %collapsed_309 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%209 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
%211 = linalg.softmax dimension(2) ins(%210 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
%expanded_310 = tensor.expand_shape %extracted_slice_303 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%212 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_310 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_311 = tensor.pack %211 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_312 = tensor.pack %expanded_310 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
%213 = linalg.batch_mmt4d ins(%pack_311, %pack_312 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%unpack_313 = tensor.unpack %213 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
%pack_314 = tensor.pack %unpack_313 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_315 = tensor.collapse_shape %pack_314 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
%214 = linalg.mmt4d ins(%collapsed_315, %cst_138 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_316 = tensor.unpack %214 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%215 = tensor.empty() : tensor<4x768xf32>
%216 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%198, %unpack_316, %cst_90 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%215 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Per-row statistics of %216 (4x768), produced by three separate ops.
//
// %217: reduction over d1 — adds every element of a row into the accumulator taken from %9.
// NOTE(review): %9 is defined outside this excerpt; presumably a zero-filled tensor<4xf32> — confirm.
%217 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%216 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// %218: divides each row sum by %cst_9 (7.68e+02, i.e. the row length), giving the row mean.
// The %out argument is not read, so %2 only serves as the destination operand.
%218 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%217 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// %219: reduction over d1 of (%216[i, j] - %218[i])^2, again accumulating on top of %9.
// The result is a sum of squared deviations; it is not divided by the row length here.
%219 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%216, %218 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
// Deviation of the element from its row mean.
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
// Normalises each row of %216 using the per-row values %218 and %219, then applies a
// per-column scale (%cst_28) and per-column offset (%cst_27). Algebraically:
//   %221[i, j] = (%216[i, j] - %218[i]) * rsqrt(%219[i] / %cst_9 + %cst_8) * %cst_28[j] + %cst_27[j]
// with %cst_9 = 7.68e+02 and %cst_8 = 9.99999974E-6. The body computes this as
// x * scale + (offset - mean * scale) rather than subtracting the mean first.
// NOTE(review): this has the shape of a layer normalisation with %cst_28 / %cst_27 as weight / bias — confirm against the source model.
%220 = tensor.empty() : tensor<4x768xf32>
%221 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%216, %219, %cst_28, %cst_27, %218 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%220 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
// %219[i] / 768: mean of the squared deviations for this row.
%380 = arith.divf %in_486, %cst_9 : f32
// Add the small constant %cst_8 before the reciprocal square root.
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
// Combined scale: rsqrt(...) * %cst_28[j].
%383 = arith.mulf %382, %in_487 : f32
// Combined offset: %cst_27[j] - %218[i] * scale.
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
// Result: x * scale + offset.
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// Packed matmul followed by a broadcast add and a tanh-based activation.
//
// Tile %221 (4x768) into 4x1 tiles -> 1x768x4x1, the LHS layout used by linalg.mmt4d below.
%pack_317 = tensor.pack %221 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
// mmt4d against the pre-packed constant %cst_139 (192x768x16x1), accumulating into %52.
// NOTE(review): %52 is defined outside this excerpt; presumably a zero-filled accumulator — confirm.
%222 = linalg.mmt4d ins(%pack_317, %cst_139 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
// Undo the 4x16 result tiling: 1x192x4x16 -> 4x3072.
%unpack_318 = tensor.unpack %222 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
// Elementwise, with x = %unpack_318[i, j] + %cst_91[j] (the vector is broadcast across rows):
//   %223[i, j] = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
// Constants: %cst_5 = 3.0, %cst_2 = 4.4715e-02, %cst_3 = 0.797884583, %cst_11 = 1.0, %cst_4 = 0.5.
// NOTE(review): this matches the tanh approximation of GELU — confirm against the source model.
%223 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_318, %cst_91 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
// x = matmul result + broadcast vector element.
%380 = arith.addf %in, %in_486 : f32
// x^3 via powf with exponent %cst_5.
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
// 0.5 * x, then multiplied by (1 + tanh(...)).
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
// Packed matmul from width 3072 back to width 768, then a broadcast add and an add of %216.
//
// Tile %223 (4x3072) into 4x1 tiles -> 1x3072x4x1.
%pack_319 = tensor.pack %223 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
// mmt4d against the pre-packed constant %cst_140 (48x3072x16x1), accumulating into %41.
// NOTE(review): %41 is defined outside this excerpt; presumably a zero-filled accumulator — confirm.
%224 = linalg.mmt4d ins(%pack_319, %cst_140 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
// Undo the 4x16 result tiling: 1x48x4x16 -> 4x768.
%unpack_320 = tensor.unpack %224 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%225 = tensor.empty() : tensor<4x768xf32>
// %226[i, j] = %216[i, j] + (%unpack_320[i, j] + %cst_92[j]); %cst_92 is broadcast across rows.
// NOTE(review): %216 is the 4x768 value that entered the preceding normalisation, so this reads as a residual connection — confirm.
%226 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%216, %unpack_320, %cst_92 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%225 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%227 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%226 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%228 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%227 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%229 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%226, %228 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%230 = tensor.empty() : tensor<4x768xf32>
%231 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%226, %229, %cst_26, %cst_25, %228 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%230 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// Packed matmul from width 768 to width 2304 (= 3 * 768), followed by a broadcast add.
//
// Tile %231 (4x768) into 4x1 tiles -> 1x768x4x1.
%pack_321 = tensor.pack %231 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
// mmt4d against the pre-packed constant %cst_141 (144x768x16x1), accumulating into %18.
// NOTE(review): %18 is defined outside this excerpt; presumably a zero-filled accumulator — confirm.
%232 = linalg.mmt4d ins(%pack_321, %cst_141 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
// Undo the 4x16 result tiling: 1x144x4x16 -> 4x2304.
%unpack_322 = tensor.unpack %232 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%233 = tensor.empty() : tensor<4x2304xf32>
// %234[i, j] = %unpack_322[i, j] + %cst_93[j]; the 2304-element vector is broadcast across rows.
%234 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_322, %cst_93 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%233 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
// Splits the 2304-wide result %234 into three 768-wide pieces and reshapes them to 4x12x64.
//
// Add a leading unit dimension: 4x2304 -> 1x4x2304.
%expanded_323 = tensor.expand_shape %234 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
// Three slices along the last dimension at offsets 0, 768 and 1536; the unit dimension is dropped again (result 4x768).
// NOTE(review): presumably the query / key / value projections, in that order — confirm against the source model.
%extracted_slice_324 = tensor.extract_slice %expanded_323[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_325 = tensor.extract_slice %expanded_323[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_326 = tensor.extract_slice %expanded_323[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
// Split the 768 columns of the first two slices into 12 groups of 64: 4x768 -> 4x12x64.
%expanded_327 = tensor.expand_shape %extracted_slice_324 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_328 = tensor.expand_shape %extracted_slice_325 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Transposed copy of the second slice: %235[a, b, c] = %expanded_328[b, a, c], i.e. 4x12x64 -> 12x4x64.
// %235 has no user inside this excerpt.
// NOTE(review): presumably feeds the second function result (tensor<12x2x1x12x4x64xf32> in the ABI declaration) — confirm.
%235 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_328 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Batched packed matmul of %expanded_327 against %expanded_328, followed by scaling,
// masking arithmetic and a softmax over the last dimension.
//
// LHS: 4x12x64 -> 12x1x64x4x1 (dimension 1 becomes the batch dimension, 4x1 inner tiles).
%pack_329 = tensor.pack %expanded_327 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
// RHS: 4x12x64 -> 12x1x64x16x1. Dimension 0 has size 4 but the inner tile is 16, so the
// remaining tile entries are filled with the padding value %cst_64 (defined outside this excerpt).
%pack_330 = tensor.pack %expanded_328 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
// One mmt4d per batch entry (12 of them), contracting over the 64-wide dimension; accumulates into %27.
// NOTE(review): %27 is defined outside this excerpt; presumably a zero-filled accumulator — confirm.
%236 = linalg.batch_mmt4d ins(%pack_329, %pack_330 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
// Unpack to 12x4x4: the destination is narrower than the 16-wide inner tile, so only 4 columns are kept.
%unpack_331 = tensor.unpack %236 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
// Flatten %4 from 1x4 to 4 so it can be indexed by d2 alone below.
%collapsed_332 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%237 = tensor.empty() : tensor<12x4x4xf32>
// Elementwise over 12x4x4, with the 4x4 constants broadcast over d0 and the 4-vector broadcast over d0 and d1:
//   %238[h, i, j] = %unpack_331[h, i, j] * %cst_7 * %cst_0[i, j] - %cst[i, j] + %collapsed_332[j] + %cst_6
// where %cst_7 = 1.25e-01 and %cst_6 = 9.99999971E-10.
// NOTE(review): 0.125 equals 1/sqrt(64); %cst_0 / %cst look like a multiplicative mask and its subtractive
// counterpart, and %4 like an additive attention mask — their contents are elided, confirm.
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_331, %cst_0, %cst, %collapsed_332 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%237 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
// Scale by %cst_7.
%380 = arith.mulf %in, %cst_7 : f32
// Multiply by the %cst_0 element, then subtract the %cst element.
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
// Add the per-column value taken from %4, then the small constant %cst_6.
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
// Softmax along dimension 2 (the last dimension) of the 12x4x4 tensor.
%239 = linalg.softmax dimension(2) ins(%238 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
// Multiplies the softmax result %239 with the third 768-wide slice, then applies a packed
// matmul with the constant %cst_142.
//
// Split the 768 columns of the third slice into 12 groups of 64: 4x768 -> 4x12x64.
%expanded_333 = tensor.expand_shape %extracted_slice_326 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Transposed copy: %240[a, b, c] = %expanded_333[b, a, c], i.e. 4x12x64 -> 12x4x64.
// %240 has no user inside this excerpt.
// NOTE(review): presumably feeds the second function result — confirm.
%240 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_333 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// LHS: softmax output 12x4x4 -> 12x1x4x4x1 (4x1 inner tiles).
%pack_334 = tensor.pack %239 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
// RHS: 4x12x64 -> 12x4x4x16x1; the 64-wide dimension is tiled by 16 and dimension 0 becomes the contraction dimension.
%pack_335 = tensor.pack %expanded_333 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
// One mmt4d per batch entry (12 of them), contracting over the size-4 dimension; accumulates into %36.
// NOTE(review): %36 is defined outside this excerpt; presumably a zero-filled accumulator — confirm.
%241 = linalg.batch_mmt4d ins(%pack_334, %pack_335 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
// Unpack and move the batch dimension back to position 1: 12x1x4x4x16 -> 4x12x64.
%unpack_336 = tensor.unpack %241 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
// Re-pack as an mmt4d LHS (4x1 inner tiles), then merge the 12 and 64 dimensions back into 768.
%pack_337 = tensor.pack %unpack_336 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_338 = tensor.collapse_shape %pack_337 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
// mmt4d against the pre-packed constant %cst_142 (48x768x16x1), accumulating into %41, then unpack to 4x768.
// NOTE(review): %41 is defined outside this excerpt; presumably a zero-filled accumulator — confirm.
%242 = linalg.mmt4d ins(%collapsed_338, %cst_142 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_339 = tensor.unpack %242 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%243 = tensor.empty() : tensor<4x768xf32>
%244 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%226, %unpack_339, %cst_94 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%243 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%245 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%244 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%246 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%245 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%247 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%244, %246 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%248 = tensor.empty() : tensor<4x768xf32>
%249 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%244, %247, %cst_24, %cst_23, %246 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%248 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_340 = tensor.pack %249 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%250 = linalg.mmt4d ins(%pack_340, %cst_143 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_341 = tensor.unpack %250 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
%251 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_341, %cst_95 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
%pack_342 = tensor.pack %251 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%252 = linalg.mmt4d ins(%pack_342, %cst_144 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_343 = tensor.unpack %252 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%253 = tensor.empty() : tensor<4x768xf32>
%254 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%244, %unpack_343, %cst_96 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%253 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%255 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%254 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%256 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%255 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%257 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%254, %256 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%258 = tensor.empty() : tensor<4x768xf32>
%259 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%254, %257, %cst_22, %cst_21, %256 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%258 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_344 = tensor.pack %259 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%260 = linalg.mmt4d ins(%pack_344, %cst_145 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_345 = tensor.unpack %260 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%261 = tensor.empty() : tensor<4x2304xf32>
%262 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_345, %cst_97 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%261 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded_346 = tensor.expand_shape %262 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_347 = tensor.extract_slice %expanded_346[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_348 = tensor.extract_slice %expanded_346[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_349 = tensor.extract_slice %expanded_346[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_350 = tensor.expand_shape %extracted_slice_347 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_351 = tensor.expand_shape %extracted_slice_348 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%263 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_351 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_352 = tensor.pack %expanded_350 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_353 = tensor.pack %expanded_351 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
%264 = linalg.batch_mmt4d ins(%pack_352, %pack_353 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
%unpack_354 = tensor.unpack %264 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
%collapsed_355 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%265 = tensor.empty() : tensor<12x4x4xf32>
%266 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_354, %cst_0, %cst, %collapsed_355 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%265 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
%267 = linalg.softmax dimension(2) ins(%266 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
%expanded_356 = tensor.expand_shape %extracted_slice_349 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%268 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_356 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
%pack_357 = tensor.pack %267 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_358 = tensor.pack %expanded_356 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
%269 = linalg.batch_mmt4d ins(%pack_357, %pack_358 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
%unpack_359 = tensor.unpack %269 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
%pack_360 = tensor.pack %unpack_359 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_361 = tensor.collapse_shape %pack_360 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
%270 = linalg.mmt4d ins(%collapsed_361, %cst_146 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_362 = tensor.unpack %270 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%271 = tensor.empty() : tensor<4x768xf32>
%272 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%254, %unpack_362, %cst_98 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%271 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%273 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%272 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%274 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%273 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%275 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%272, %274 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%276 = tensor.empty() : tensor<4x768xf32>
%277 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%272, %275, %cst_20, %cst_19, %274 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%276 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_363 = tensor.pack %277 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%278 = linalg.mmt4d ins(%pack_363, %cst_147 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_364 = tensor.unpack %278 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
%279 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_364, %cst_99 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
%pack_365 = tensor.pack %279 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%280 = linalg.mmt4d ins(%pack_365, %cst_148 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_366 = tensor.unpack %280 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%281 = tensor.empty() : tensor<4x768xf32>
%282 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%272, %unpack_366, %cst_100 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%281 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
%283 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%282 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%284 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%283 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
%285 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%282, %284 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%286 = tensor.empty() : tensor<4x768xf32>
%287 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%282, %285, %cst_18, %cst_17, %284 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%286 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
%pack_367 = tensor.pack %287 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%288 = linalg.mmt4d ins(%pack_367, %cst_149 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_368 = tensor.unpack %288 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%289 = tensor.empty() : tensor<4x2304xf32>
%290 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_368, %cst_101 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%289 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
%expanded_369 = tensor.expand_shape %290 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_370 = tensor.extract_slice %expanded_369[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_371 = tensor.extract_slice %expanded_369[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_372 = tensor.extract_slice %expanded_369[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%expanded_373 = tensor.expand_shape %extracted_slice_370 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_374 = tensor.expand_shape %extracted_slice_371 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Copy of %expanded_374 with its first two dims swapped (4x12x64 -> 12x4x64).
// %291 has no consumer inside this chunk; any use is further down the function.
%291 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_374 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Tile both operands for batch_mmt4d: batch = 12, contraction over the 64-wide dim.
// LHS uses 4x1 tiles; RHS uses 16x1 tiles, padding its 4 rows up to 16 with %cst_64.
%pack_375 = tensor.pack %expanded_373 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_376 = tensor.pack %expanded_374 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
// Batched LHS * RHS^T on the tiled layout, with %27 as the output/init operand
// (defined outside this chunk; presumably zero-filled -- TODO confirm).
%292 = linalg.batch_mmt4d ins(%pack_375, %pack_376 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
// Unpack to 12x4x4, discarding the padded columns of the 4x16 tile.
%unpack_377 = tensor.unpack %292 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
// Flatten %4 (1x4) to a 4-vector; it is broadcast along d2 below.
// NOTE(review): %4 is defined outside this chunk; presumably derived from the attention-mask input.
%collapsed_378 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%293 = tensor.empty() : tensor<12x4x4xf32>
// Elementwise: ((x * %cst_7) * %cst_0[d1, d2] - %cst[d1, d2]) + %collapsed_378[d2] + %cst_6.
// %cst_7 is 0.125 and %cst_6 is ~1e-9 per their definitions at the top of the function.
// NOTE(review): %cst_0 / %cst are elided 4x4 constants; presumably a causal mask and its
// large-negative complement -- confirm.
%294 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_377, %cst_0, %cst, %collapsed_378 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%293 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
// Softmax along the last dimension (dimension 2).
%295 = linalg.softmax dimension(2) ins(%294 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
// View the third 768-column slice as 4x12x64.
%expanded_379 = tensor.expand_shape %extracted_slice_372 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Copy with the first two dims swapped (4x12x64 -> 12x4x64).
// %296 has no consumer inside this chunk; any use is further down the function.
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_379 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Tile the softmax output (LHS, 4x1 tiles) and the third slice (RHS, 16x1 tiles over the
// 64-wide dim) for batch_mmt4d; batch = 12, contraction over the size-4 dim.
%pack_380 = tensor.pack %295 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_381 = tensor.pack %expanded_379 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
// %36 is the output/init operand (defined outside this chunk; presumably zero-filled -- TODO confirm).
%297 = linalg.batch_mmt4d ins(%pack_380, %pack_381 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
// Unpack straight into 4x12x64 layout (outer_dims_perm swaps the batch dim back to position 1).
%unpack_382 = tensor.unpack %297 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
// Re-tile the 4x12x64 result as an mmt4d LHS and merge the 12x64 outer dims into 768.
%pack_383 = tensor.pack %unpack_382 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_384 = tensor.collapse_shape %pack_383 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
// 768 -> 768 projection against the pre-packed weight %cst_150 (48 tiles of 16 columns).
%298 = linalg.mmt4d ins(%collapsed_384, %cst_150 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_385 = tensor.unpack %298 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%299 = tensor.empty() : tensor<4x768xf32>
// result = %282 + (projection + %cst_102[d1]).
// NOTE(review): reads as bias add plus residual connection; %282 is defined before this chunk.
%300 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%282, %unpack_385, %cst_102 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%299 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalization of %300 over its 768 columns, in four steps.
// Step 1: sum each row (reduction over d1), accumulating onto %9.
// NOTE(review): %9 is defined outside this chunk; assumed zero-filled -- confirm.
%301 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%300 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 2: divide by %cst_9 (768.0) to get the per-row mean.
%302 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%301 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 3: per-row sum of squared deviations from the mean.
%303 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%300, %302 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%304 = tensor.empty() : tensor<4x768xf32>
// Step 4: scale = rsqrt(sumsq / 768 + %cst_8) * %cst_16[d1];
//         result = x * scale + (%cst_15[d1] - mean * scale).
// %cst_8 is ~1e-5 per its definition at the top of the function.
%305 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%300, %303, %cst_16, %cst_15, %302 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%304 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// 768 -> 3072 projection: tile the normalized activations, multiply by the pre-packed
// weight %cst_151 (192 tiles of 16 columns), and unpack to 4x3072.
%pack_386 = tensor.pack %305 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%306 = linalg.mmt4d ins(%pack_386, %cst_151 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_387 = tensor.unpack %306 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
// x = matmul + %cst_103[d1], then the tanh form of GELU:
//   0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
// with the literals taken from %cst_4, %cst_11, %cst_3, %cst_2 and %cst_5 (3.0).
%307 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_387, %cst_103 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
// 3072 -> 768 projection against the pre-packed weight %cst_152, unpacked to 4x768.
%pack_388 = tensor.pack %307 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%308 = linalg.mmt4d ins(%pack_388, %cst_152 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_389 = tensor.unpack %308 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%309 = tensor.empty() : tensor<4x768xf32>
// result = %300 + (projection + %cst_104[d1]); %300 is the pre-normalization activation,
// so this reads as bias add plus a skip connection.
%310 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%300, %unpack_389, %cst_104 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%309 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalization of %310 over its 768 columns, in four steps.
// Step 1: sum each row (reduction over d1), accumulating onto %9.
// NOTE(review): %9 is defined outside this chunk; assumed zero-filled -- confirm.
%311 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%310 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 2: divide by %cst_9 (768.0) to get the per-row mean.
%312 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%311 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 3: per-row sum of squared deviations from the mean.
%313 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%310, %312 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%314 = tensor.empty() : tensor<4x768xf32>
// Step 4: scale = rsqrt(sumsq / 768 + %cst_8) * %cst_58[d1];
//         result = x * scale + (%cst_57[d1] - mean * scale).
%315 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%310, %313, %cst_58, %cst_57, %312 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%314 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// 768 -> 2304 projection: tile %315, multiply by the pre-packed weight %cst_153
// (144 tiles of 16 columns), and unpack to 4x2304.
%pack_390 = tensor.pack %315 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%316 = linalg.mmt4d ins(%pack_390, %cst_153 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_391 = tensor.unpack %316 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%317 = tensor.empty() : tensor<4x2304xf32>
// Add %cst_105 to every row (one value per column).
// NOTE(review): presumably the bias of a fused QKV projection -- confirm against the model.
%318 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_391, %cst_105 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%317 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
// Split the 2304 columns of %318 into three 768-column slices (offsets 0, 768, 1536).
// The slices are rank-reducing: the unit leading dim added by the expand_shape is dropped again.
// NOTE(review): presumably query / key / value in that order -- confirm against the model.
%expanded_392 = tensor.expand_shape %318 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_393 = tensor.extract_slice %expanded_392[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_394 = tensor.extract_slice %expanded_392[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_395 = tensor.extract_slice %expanded_392[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
// View the first two slices as 4x12x64 (768 = 12 * 64).
%expanded_396 = tensor.expand_shape %extracted_slice_393 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_397 = tensor.expand_shape %extracted_slice_394 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Copy of %expanded_397 with its first two dims swapped (4x12x64 -> 12x4x64).
// %319 has no consumer inside this chunk; any use is further down the function.
%319 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_397 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Tile both operands for batch_mmt4d: batch = 12, contraction over the 64-wide dim.
// LHS uses 4x1 tiles; RHS uses 16x1 tiles, padding its 4 rows up to 16 with %cst_64.
%pack_398 = tensor.pack %expanded_396 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_399 = tensor.pack %expanded_397 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
// Batched LHS * RHS^T on the tiled layout, with %27 as the output/init operand
// (defined outside this chunk; presumably zero-filled -- TODO confirm).
%320 = linalg.batch_mmt4d ins(%pack_398, %pack_399 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
// Unpack to 12x4x4, discarding the padded columns of the 4x16 tile.
%unpack_400 = tensor.unpack %320 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
// Flatten %4 (1x4) to a 4-vector; it is broadcast along d2 below.
// NOTE(review): %4 is defined outside this chunk; presumably derived from the attention-mask input.
%collapsed_401 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%321 = tensor.empty() : tensor<12x4x4xf32>
// Elementwise: ((x * %cst_7) * %cst_0[d1, d2] - %cst[d1, d2]) + %collapsed_401[d2] + %cst_6.
// %cst_7 is 0.125 and %cst_6 is ~1e-9 per their definitions at the top of the function.
// NOTE(review): %cst_0 / %cst are elided 4x4 constants; presumably a causal mask and its
// large-negative complement -- confirm.
%322 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_400, %cst_0, %cst, %collapsed_401 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%321 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
// Softmax along the last dimension (dimension 2).
%323 = linalg.softmax dimension(2) ins(%322 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
// View the third 768-column slice as 4x12x64.
%expanded_402 = tensor.expand_shape %extracted_slice_395 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Copy with the first two dims swapped (4x12x64 -> 12x4x64).
// %324 has no consumer inside this chunk; any use is further down the function.
%324 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_402 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Tile the softmax output (LHS, 4x1 tiles) and the third slice (RHS, 16x1 tiles over the
// 64-wide dim) for batch_mmt4d; batch = 12, contraction over the size-4 dim.
%pack_403 = tensor.pack %323 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_404 = tensor.pack %expanded_402 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
// %36 is the output/init operand (defined outside this chunk; presumably zero-filled -- TODO confirm).
%325 = linalg.batch_mmt4d ins(%pack_403, %pack_404 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
// Unpack straight into 4x12x64 layout (outer_dims_perm swaps the batch dim back to position 1).
%unpack_405 = tensor.unpack %325 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
// Re-tile the 4x12x64 result as an mmt4d LHS and merge the 12x64 outer dims into 768.
%pack_406 = tensor.pack %unpack_405 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_407 = tensor.collapse_shape %pack_406 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
// 768 -> 768 projection against the pre-packed weight %cst_154 (48 tiles of 16 columns).
%326 = linalg.mmt4d ins(%collapsed_407, %cst_154 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_408 = tensor.unpack %326 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%327 = tensor.empty() : tensor<4x768xf32>
// result = %310 + (projection + %cst_106[d1]); reads as bias add plus a skip connection.
%328 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%310, %unpack_408, %cst_106 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%327 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalization of %328 over its 768 columns, in four steps.
// Step 1: sum each row (reduction over d1), accumulating onto %9.
// NOTE(review): %9 is defined outside this chunk; assumed zero-filled -- confirm.
%329 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%328 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 2: divide by %cst_9 (768.0) to get the per-row mean.
%330 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%329 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 3: per-row sum of squared deviations from the mean.
%331 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%328, %330 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%332 = tensor.empty() : tensor<4x768xf32>
// Step 4: scale = rsqrt(sumsq / 768 + %cst_8) * %cst_56[d1];
//         result = x * scale + (%cst_55[d1] - mean * scale).
%333 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%328, %331, %cst_56, %cst_55, %330 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%332 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// 768 -> 3072 projection: tile the normalized activations, multiply by the pre-packed
// weight %cst_155 (192 tiles of 16 columns), and unpack to 4x3072.
%pack_409 = tensor.pack %333 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%334 = linalg.mmt4d ins(%pack_409, %cst_155 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_410 = tensor.unpack %334 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
// x = matmul + %cst_107[d1], then the tanh form of GELU:
//   0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
// with the literals taken from %cst_4, %cst_11, %cst_3, %cst_2 and %cst_5 (3.0).
%335 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_410, %cst_107 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
// 3072 -> 768 projection against the pre-packed weight %cst_156, unpacked to 4x768.
%pack_411 = tensor.pack %335 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%336 = linalg.mmt4d ins(%pack_411, %cst_156 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_412 = tensor.unpack %336 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%337 = tensor.empty() : tensor<4x768xf32>
// result = %328 + (projection + %cst_108[d1]); %328 is the pre-normalization activation,
// so this reads as bias add plus a skip connection.
%338 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%328, %unpack_412, %cst_108 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%337 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalization of %338 over its 768 columns, in four steps.
// Step 1: sum each row (reduction over d1), accumulating onto %9.
// NOTE(review): %9 is defined outside this chunk; assumed zero-filled -- confirm.
%339 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%338 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 2: divide by %cst_9 (768.0) to get the per-row mean.
%340 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%339 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 3: per-row sum of squared deviations from the mean.
%341 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%338, %340 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%342 = tensor.empty() : tensor<4x768xf32>
// Step 4: scale = rsqrt(sumsq / 768 + %cst_8) * %cst_54[d1];
//         result = x * scale + (%cst_53[d1] - mean * scale).
%343 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%338, %341, %cst_54, %cst_53, %340 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%342 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// 768 -> 2304 projection: tile %343, multiply by the pre-packed weight %cst_157
// (144 tiles of 16 columns), and unpack to 4x2304.
%pack_413 = tensor.pack %343 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%344 = linalg.mmt4d ins(%pack_413, %cst_157 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
%unpack_414 = tensor.unpack %344 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
%345 = tensor.empty() : tensor<4x2304xf32>
// Add %cst_109 to every row (one value per column).
// NOTE(review): presumably the bias of a fused QKV projection -- confirm against the model.
%346 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_414, %cst_109 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%345 : tensor<4x2304xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
linalg.yield %380 : f32
} -> tensor<4x2304xf32>
// Split the 2304 columns of %346 into three 768-column slices (offsets 0, 768, 1536).
// The slices are rank-reducing: the unit leading dim added by the expand_shape is dropped again.
// NOTE(review): presumably query / key / value in that order -- confirm against the model.
%expanded_415 = tensor.expand_shape %346 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
%extracted_slice_416 = tensor.extract_slice %expanded_415[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_417 = tensor.extract_slice %expanded_415[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
%extracted_slice_418 = tensor.extract_slice %expanded_415[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
// View the first two slices as 4x12x64 (768 = 12 * 64).
%expanded_419 = tensor.expand_shape %extracted_slice_416 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
%expanded_420 = tensor.expand_shape %extracted_slice_417 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Copy of %expanded_420 with its first two dims swapped (4x12x64 -> 12x4x64).
// %347 has no consumer inside this chunk; any use is further down the function.
%347 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_420 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Tile both operands for batch_mmt4d: batch = 12, contraction over the 64-wide dim.
// LHS uses 4x1 tiles; RHS uses 16x1 tiles, padding its 4 rows up to 16 with %cst_64.
%pack_421 = tensor.pack %expanded_419 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
%pack_422 = tensor.pack %expanded_420 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
// Batched LHS * RHS^T on the tiled layout, with %27 as the output/init operand
// (defined outside this chunk; presumably zero-filled -- TODO confirm).
%348 = linalg.batch_mmt4d ins(%pack_421, %pack_422 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
// Unpack to 12x4x4, discarding the padded columns of the 4x16 tile.
%unpack_423 = tensor.unpack %348 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
// Flatten %4 (1x4) to a 4-vector; it is broadcast along d2 below.
// NOTE(review): %4 is defined outside this chunk; presumably derived from the attention-mask input.
%collapsed_424 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
%349 = tensor.empty() : tensor<12x4x4xf32>
// Elementwise: ((x * %cst_7) * %cst_0[d1, d2] - %cst[d1, d2]) + %collapsed_424[d2] + %cst_6.
// %cst_7 is 0.125 and %cst_6 is ~1e-9 per their definitions at the top of the function.
// NOTE(review): %cst_0 / %cst are elided 4x4 constants; presumably a causal mask and its
// large-negative complement -- confirm.
%350 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_423, %cst_0, %cst, %collapsed_424 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%349 : tensor<12x4x4xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
%380 = arith.mulf %in, %cst_7 : f32
%381 = arith.mulf %380, %in_486 : f32
%382 = arith.subf %381, %in_487 : f32
%383 = arith.addf %382, %in_488 : f32
%384 = arith.addf %383, %cst_6 : f32
linalg.yield %384 : f32
} -> tensor<12x4x4xf32>
// Softmax along the last dimension (dimension 2).
%351 = linalg.softmax dimension(2) ins(%350 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
// View the third 768-column slice as 4x12x64.
%expanded_425 = tensor.expand_shape %extracted_slice_418 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
// Copy with the first two dims swapped (4x12x64 -> 12x4x64).
// %352 has no consumer inside this chunk; any use is further down the function.
%352 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_425 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<12x4x64xf32>
// Tile the softmax output (LHS, 4x1 tiles) and the third slice (RHS, 16x1 tiles over the
// 64-wide dim) for batch_mmt4d; batch = 12, contraction over the size-4 dim.
%pack_426 = tensor.pack %351 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
%pack_427 = tensor.pack %expanded_425 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
// %36 is the output/init operand (defined outside this chunk; presumably zero-filled -- TODO confirm).
%353 = linalg.batch_mmt4d ins(%pack_426, %pack_427 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
// Unpack straight into 4x12x64 layout (outer_dims_perm swaps the batch dim back to position 1).
%unpack_428 = tensor.unpack %353 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
// Re-tile the 4x12x64 result as an mmt4d LHS and merge the 12x64 outer dims into 768.
%pack_429 = tensor.pack %unpack_428 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
%collapsed_430 = tensor.collapse_shape %pack_429 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
// 768 -> 768 projection against the pre-packed weight %cst_158 (48 tiles of 16 columns).
%354 = linalg.mmt4d ins(%collapsed_430, %cst_158 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_431 = tensor.unpack %354 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%355 = tensor.empty() : tensor<4x768xf32>
// result = %338 + (projection + %cst_110[d1]); reads as bias add plus a skip connection.
%356 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%338, %unpack_431, %cst_110 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%355 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalization of %356 over its 768 columns, in four steps.
// Step 1: sum each row (reduction over d1), accumulating onto %9.
// NOTE(review): %9 is defined outside this chunk; assumed zero-filled -- confirm.
%357 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%356 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 2: divide by %cst_9 (768.0) to get the per-row mean.
%358 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%357 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 3: per-row sum of squared deviations from the mean.
%359 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%356, %358 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%360 = tensor.empty() : tensor<4x768xf32>
// Step 4: scale = rsqrt(sumsq / 768 + %cst_8) * %cst_52[d1];
//         result = x * scale + (%cst_51[d1] - mean * scale).
%361 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%356, %359, %cst_52, %cst_51, %358 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%360 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// 768 -> 3072 projection: tile the normalized activations, multiply by the pre-packed
// weight %cst_159 (192 tiles of 16 columns), and unpack to 4x3072.
%pack_432 = tensor.pack %361 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
%362 = linalg.mmt4d ins(%pack_432, %cst_159 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
%unpack_433 = tensor.unpack %362 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
// x = matmul + %cst_111[d1], then the tanh form of GELU:
//   0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
// with the literals taken from %cst_4, %cst_11, %cst_3, %cst_2 and %cst_5 (3.0).
%363 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_433, %cst_111 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.addf %in, %in_486 : f32
%381 = math.powf %380, %cst_5 : f32
%382 = arith.mulf %381, %cst_2 : f32
%383 = arith.addf %380, %382 : f32
%384 = arith.mulf %383, %cst_3 : f32
%385 = math.tanh %384 : f32
%386 = arith.addf %385, %cst_11 : f32
%387 = arith.mulf %380, %cst_4 : f32
%388 = arith.mulf %387, %386 : f32
linalg.yield %388 : f32
} -> tensor<4x3072xf32>
// 3072 -> 768 projection against the pre-packed weight %cst_160, unpacked to 4x768.
%pack_434 = tensor.pack %363 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
%364 = linalg.mmt4d ins(%pack_434, %cst_160 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
%unpack_435 = tensor.unpack %364 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
%365 = tensor.empty() : tensor<4x768xf32>
// result = %356 + (projection + %cst_112[d1]); %356 is the pre-normalization activation,
// so this reads as bias add plus a skip connection.
%366 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%356, %unpack_435, %cst_112 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%365 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
%380 = arith.addf %in_486, %in_487 : f32
%381 = arith.addf %in, %380 : f32
linalg.yield %381 : f32
} -> tensor<4x768xf32>
// Row-wise normalization of %366 over its 768 columns, in four steps.
// NOTE(review): this is the last normalization visible in this chunk; presumably the model's
// final layer norm -- confirm.
// Step 1: sum each row (reduction over d1), accumulating onto %9.
// NOTE(review): %9 is defined outside this chunk; assumed zero-filled -- confirm.
%367 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%366 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.addf %out, %in : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 2: divide by %cst_9 (768.0) to get the per-row mean.
%368 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%367 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
^bb0(%in: f32, %out: f32):
%380 = arith.divf %in, %cst_9 : f32
linalg.yield %380 : f32
} -> tensor<4xf32>
// Step 3: per-row sum of squared deviations from the mean.
%369 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%366, %368 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
^bb0(%in: f32, %in_486: f32, %out: f32):
%380 = arith.subf %in, %in_486 : f32
%381 = arith.mulf %380, %380 : f32
%382 = arith.addf %out, %381 : f32
linalg.yield %382 : f32
} -> tensor<4xf32>
%370 = tensor.empty() : tensor<4x768xf32>
// Step 4: scale = rsqrt(sumsq / 768 + %cst_8) * %cst_14[d1];
//         result = x * scale + (%cst_13[d1] - mean * scale).
%371 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%366, %369, %cst_14, %cst_13, %368 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%370 : tensor<4x768xf32>) {
^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
%380 = arith.divf %in_486, %cst_9 : f32
%381 = arith.addf %380, %cst_8 : f32
%382 = math.rsqrt %381 : f32
%383 = arith.mulf %382, %in_487 : f32
%384 = arith.mulf %in_489, %383 : f32
%385 = arith.subf %in_488, %384 : f32
%386 = arith.mulf %in, %383 : f32
%387 = arith.addf %386, %385 : f32
linalg.yield %387 : f32
} -> tensor<4x768xf32>
// Final projection of the normalized 4x768 activations to 50257 columns.
// Tile the LHS with inner tiles [4, 1] for mmt4d.
%pack_436 = tensor.pack %371 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
// Accumulator initialised with %cst_64 (defined above this excerpt; presumably
// 0.0 -- confirm).
%372 = tensor.empty() : tensor<1x3142x4x16xf32>
%373 = linalg.fill ins(%cst_64 : f32) outs(%372 : tensor<1x3142x4x16xf32>) -> tensor<1x3142x4x16xf32>
// Tiled matmul against the pre-packed constant %cst_161 (3142x768x16x1).
%374 = linalg.mmt4d ins(%pack_436, %cst_161 : tensor<1x768x4x1xf32>, tensor<3142x768x16x1xf32>) outs(%373 : tensor<1x3142x4x16xf32>) -> tensor<1x3142x4x16xf32>
// 3142 tiles * 16 = 50272 columns in tiled form; the unpack keeps only the
// first 50257, dropping the tile padding.
%375 = tensor.empty() : tensor<4x50257xf32>
%unpack_437 = tensor.unpack %374 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %375 : tensor<1x3142x4x16xf32> -> tensor<4x50257xf32>
// Add a leading unit dimension: 4x50257 -> 1x4x50257 (the shape exported as
// "output0" below).
%expanded_438 = tensor.expand_shape %unpack_437 [[0, 1], [2]] output_shape [1, 4, 50257] : tensor<4x50257xf32> into tensor<1x4x50257xf32>
// Build twelve 2x1x12x4x64 tensors, each from a pair of 12x4x64 sources.
// For every pair, the first source is inserted at offset 0 of the leading
// dimension of the shared empty tensor %376, and the second source is inserted
// at offset 1 of that result. The two inserts together cover the whole tensor,
// so no element of %376 is left undefined in the paired results.
// All sources are defined above this excerpt (presumably the per-layer
// key/value tensors of the attention blocks -- confirm against the producers).
// Pairs are emitted from (%347, %352) down to (%23, %32).
%376 = tensor.empty() : tensor<2x1x12x4x64xf32>
%inserted_slice = tensor.insert_slice %347 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_439 = tensor.insert_slice %352 into %inserted_slice[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_440 = tensor.insert_slice %319 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_441 = tensor.insert_slice %324 into %inserted_slice_440[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_442 = tensor.insert_slice %291 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_443 = tensor.insert_slice %296 into %inserted_slice_442[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_444 = tensor.insert_slice %263 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_445 = tensor.insert_slice %268 into %inserted_slice_444[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_446 = tensor.insert_slice %235 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_447 = tensor.insert_slice %240 into %inserted_slice_446[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_448 = tensor.insert_slice %207 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_449 = tensor.insert_slice %212 into %inserted_slice_448[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_450 = tensor.insert_slice %179 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_451 = tensor.insert_slice %184 into %inserted_slice_450[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_452 = tensor.insert_slice %151 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_453 = tensor.insert_slice %156 into %inserted_slice_452[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_454 = tensor.insert_slice %123 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_455 = tensor.insert_slice %128 into %inserted_slice_454[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_456 = tensor.insert_slice %95 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_457 = tensor.insert_slice %100 into %inserted_slice_456[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_458 = tensor.insert_slice %67 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_459 = tensor.insert_slice %72 into %inserted_slice_458[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_460 = tensor.insert_slice %23 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
%inserted_slice_461 = tensor.insert_slice %32 into %inserted_slice_460[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
// Stack the twelve paired 2x1x12x4x64 tensors into one 12x2x1x12x4x64 tensor.
// Each pair is first collapsed to 2x12x4x64 (dims 1 and 2 merged), then
// inserted at leading offset k with sizes [1, 2, 1, 12, 4, 64].
// Offsets run 0..11 and are filled in order, chaining each insert onto the
// previous result, so the final value has every element written.
// Offset 0 takes %inserted_slice_461 (sources %23/%32) and offset 11 takes
// %inserted_slice_439 (sources %347/%352), i.e. the reverse of the order in
// which the pairs were built.
%377 = tensor.empty() : tensor<12x2x1x12x4x64xf32>
%collapsed_462 = tensor.collapse_shape %inserted_slice_461 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_463 = tensor.insert_slice %collapsed_462 into %377[0, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_464 = tensor.collapse_shape %inserted_slice_459 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_465 = tensor.insert_slice %collapsed_464 into %inserted_slice_463[1, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_466 = tensor.collapse_shape %inserted_slice_457 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_467 = tensor.insert_slice %collapsed_466 into %inserted_slice_465[2, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_468 = tensor.collapse_shape %inserted_slice_455 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_469 = tensor.insert_slice %collapsed_468 into %inserted_slice_467[3, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_470 = tensor.collapse_shape %inserted_slice_453 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_471 = tensor.insert_slice %collapsed_470 into %inserted_slice_469[4, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_472 = tensor.collapse_shape %inserted_slice_451 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_473 = tensor.insert_slice %collapsed_472 into %inserted_slice_471[5, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_474 = tensor.collapse_shape %inserted_slice_449 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_475 = tensor.insert_slice %collapsed_474 into %inserted_slice_473[6, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_476 = tensor.collapse_shape %inserted_slice_447 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_477 = tensor.insert_slice %collapsed_476 into %inserted_slice_475[7, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_478 = tensor.collapse_shape %inserted_slice_445 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_479 = tensor.insert_slice %collapsed_478 into %inserted_slice_477[8, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_480 = tensor.collapse_shape %inserted_slice_443 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_481 = tensor.insert_slice %collapsed_480 into %inserted_slice_479[9, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_482 = tensor.collapse_shape %inserted_slice_441 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_483 = tensor.insert_slice %collapsed_482 into %inserted_slice_481[10, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
%collapsed_484 = tensor.collapse_shape %inserted_slice_439 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
%inserted_slice_485 = tensor.insert_slice %collapsed_484 into %inserted_slice_483[11, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
// Export both results as HAL buffer views and return them. The names and
// shapes match the iree.abi.declaration on @forward: output0 is
// tensor<1x4x50257xf32>, output1 is tensor<12x2x1x12x4x64xf32>.
%378 = hal.tensor.export %expanded_438 "output0" : tensor<1x4x50257xf32> -> !hal.buffer_view
%379 = hal.tensor.export %inserted_slice_485 "output1" : tensor<12x2x1x12x4x64xf32> -> !hal.buffer_view
util.return %378, %379 : !hal.buffer_view, !hal.buffer_view
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment