IanWood1/sinkreshapes-changeonly-before.mlir Secret

## sinkreshapes-changeonly-before.mlir
// -----// IR Dump After SinkReshapesPass (iree-flow-sink-reshapes) //----- //
util.func public @forward(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view) attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @forward(%input0: tensor<1x4xi32>, %input1: tensor<1x4xi32>) -> (%output0: tensor<1x4x50257xf32>, %output1: tensor<12x2x1x12x4x64xf32>)"}} {
  %cst = arith.constant dense_resource<__elided__> : tensor<4x4xf32>
  %cst_0 = arith.constant dense_resource<__elided__> : tensor<4x4xf32>
  %cst_1 = arith.constant dense_resource<__elided__> : tensor<1x4x768xf32>
  %cst_2 = arith.constant 4.471500e-02 : f32
  %cst_3 = arith.constant 0.797884583 : f32
  %cst_4 = arith.constant 5.000000e-01 : f32
  %cst_5 = arith.constant 3.000000e+00 : f32
  %cst_6 = arith.constant 9.99999971E-10 : f32
  %cst_7 = arith.constant 1.250000e-01 : f32
  %cst_8 = arith.constant 9.99999974E-6 : f32
  %cst_9 = arith.constant 7.680000e+02 : f32
  %cst_10 = arith.constant -1.000000e+04 : f32
  %cst_11 = arith.constant 1.000000e+00 : f32
  %cst_12 = arith.constant dense_resource<__elided__> : tensor<50257x768xf32>
  %cst_13 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_14 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_15 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_16 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_17 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_18 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_19 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_20 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_21 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_22 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_23 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_24 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_25 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_26 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_27 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_28 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_29 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_30 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_31 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_32 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_33 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_34 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_35 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_36 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_37 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_38 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_39 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_40 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_41 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_42 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_43 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_44 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_45 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_46 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_47 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_48 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_49 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_50 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_51 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_52 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_53 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_54 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_55 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_56 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_57 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_58 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_59 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_60 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_61 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_62 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_63 = arith.constant -0.000000e+00 : f32
  %cst_64 = arith.constant 0.000000e+00 : f32
  %cst_65 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_66 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_67 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_68 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_69 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_70 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_71 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_72 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_73 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_74 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_75 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_76 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_77 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_78 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_79 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_80 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_81 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_82 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_83 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_84 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_85 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_86 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_87 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_88 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_89 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_90 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_91 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_92 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_93 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_94 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_95 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_96 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_97 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_98 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_99 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_100 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_101 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_102 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_103 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_104 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_105 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_106 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_107 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_108 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_109 = arith.constant dense_resource<__elided__> : tensor<2304xf32>
  %cst_110 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_111 = arith.constant dense_resource<__elided__> : tensor<3072xf32>
  %cst_112 = arith.constant dense_resource<__elided__> : tensor<768xf32>
  %cst_113 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_114 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_115 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_116 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_117 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_118 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_119 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_120 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_121 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_122 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_123 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_124 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_125 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_126 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_127 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_128 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_129 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_130 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_131 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_132 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_133 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_134 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_135 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_136 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_137 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_138 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_139 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_140 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_141 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_142 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_143 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_144 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_145 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_146 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_147 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_148 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_149 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_150 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_151 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_152 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_153 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_154 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_155 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_156 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_157 = arith.constant dense_resource<__elided__> : tensor<144x768x16x1xf32>
  %cst_158 = arith.constant dense_resource<__elided__> : tensor<48x768x16x1xf32>
  %cst_159 = arith.constant dense_resource<__elided__> : tensor<192x768x16x1xf32>
  %cst_160 = arith.constant dense_resource<__elided__> : tensor<48x3072x16x1xf32>
  %cst_161 = arith.constant dense_resource<__elided__> : tensor<3142x768x16x1xf32>
  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<1x4xi32>
  %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<1x4xi32>
  %2 = tensor.empty() : tensor<4xf32>
  %3 = tensor.empty() : tensor<1x4xf32>
  %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<1x4xi32>) outs(%3 : tensor<1x4xf32>) {
  ^bb0(%in: i32, %out: f32):
    %380 = arith.sitofp %in : i32 to f32
    %381 = arith.subf %cst_11, %380 : f32
    %382 = arith.mulf %381, %cst_10 : f32
    linalg.yield %382 : f32
  } -> tensor<1x4xf32>
  %5 = tensor.empty() : tensor<12x4x4xf32>
  %6 = tensor.empty() : tensor<4x768xf32>
  %7 = tensor.empty() : tensor<1x4x768xf32>
  %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0, %cst_1 : tensor<1x4xi32>, tensor<1x4x768xf32>) outs(%7 : tensor<1x4x768xf32>) {
  ^bb0(%in: i32, %in_486: f32, %out: f32):
    %380 = linalg.index 2 : index
    %381 = arith.index_cast %in : i32 to index
    %extracted = tensor.extract %cst_12[%381, %380] : tensor<50257x768xf32>
    %382 = arith.addf %extracted, %in_486 : f32
    linalg.yield %382 : f32
  } -> tensor<1x4x768xf32>
  %collapsed = tensor.collapse_shape %8 [[0, 1], [2]] : tensor<1x4x768xf32> into tensor<4x768xf32>
  %9 = linalg.fill ins(%cst_63 : f32) outs(%2 : tensor<4xf32>) -> tensor<4xf32>
  %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed, %11 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %collapsed_162 = tensor.collapse_shape %8 [[0, 1], [2]] : tensor<1x4x768xf32> into tensor<4x768xf32>
  %13 = tensor.empty() : tensor<4x768xf32>
  %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_162, %12, %cst_62, %cst_61, %11 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%13 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %15 = tensor.empty() : tensor<4x2304xf32>
  %16 = tensor.empty() : tensor<1x768x4x1xf32>
  %pack = tensor.pack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %17 = tensor.empty() : tensor<1x144x4x16xf32>
  %18 = linalg.fill ins(%cst_64 : f32) outs(%17 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %19 = linalg.mmt4d ins(%pack, %cst_113 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %20 = tensor.empty() : tensor<4x2304xf32>
  %21 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack, %cst_65 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%20 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  %expanded = tensor.expand_shape %21 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice = tensor.extract_slice %expanded[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_163 = tensor.extract_slice %expanded[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_164 = tensor.extract_slice %expanded[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %expanded_165 = tensor.expand_shape %extracted_slice [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %22 = tensor.empty() : tensor<12x4x64xf32>
  %expanded_166 = tensor.expand_shape %extracted_slice_163 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_166 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  %24 = tensor.empty() : tensor<12x1x64x4x1xf32>
  %pack_167 = tensor.pack %expanded_165 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %25 = tensor.empty() : tensor<12x1x64x16x1xf32>
  %pack_168 = tensor.pack %expanded_166 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  %26 = tensor.empty() : tensor<12x1x1x4x16xf32>
  %27 = linalg.fill ins(%cst_64 : f32) outs(%26 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %28 = linalg.batch_mmt4d ins(%pack_167, %pack_168 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_169 = tensor.unpack %28 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  %collapsed_170 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %29 = tensor.empty() : tensor<12x4x4xf32>
  %30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_169, %cst_0, %cst, %collapsed_170 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%29 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  %31 = linalg.softmax dimension(2) ins(%30 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  %expanded_171 = tensor.expand_shape %extracted_slice_164 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_171 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  %33 = tensor.empty() : tensor<12x1x4x4x1xf32>
  %pack_172 = tensor.pack %31 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %34 = tensor.empty() : tensor<12x4x4x16x1xf32>
  %pack_173 = tensor.pack %expanded_171 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %35 = tensor.empty() : tensor<12x1x4x4x16xf32>
  %36 = linalg.fill ins(%cst_64 : f32) outs(%35 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  %37 = linalg.batch_mmt4d ins(%pack_172, %pack_173 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  %38 = tensor.empty() : tensor<4x12x64xf32>
  %unpack_174 = tensor.unpack %37 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  %39 = tensor.empty() : tensor<1x12x64x4x1xf32>
  %pack_175 = tensor.pack %unpack_174 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_176 = tensor.collapse_shape %pack_175 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  %40 = tensor.empty() : tensor<1x48x4x16xf32>
  %41 = linalg.fill ins(%cst_64 : f32) outs(%40 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %42 = linalg.mmt4d ins(%collapsed_176, %cst_114 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_177 = tensor.unpack %42 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %collapsed_178 = tensor.collapse_shape %8 [[0, 1], [2]] : tensor<1x4x768xf32> into tensor<4x768xf32>
  %43 = tensor.empty() : tensor<4x768xf32>
  %44 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_178, %unpack_177, %cst_66 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%43 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  %45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%44 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %46 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%44, %46 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %48 = tensor.empty() : tensor<4x768xf32>
  %49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %47, %cst_60, %cst_59, %46 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%48 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %50 = tensor.empty() : tensor<4x3072xf32>
  %pack_179 = tensor.pack %49 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %51 = tensor.empty() : tensor<1x192x4x16xf32>
  %52 = linalg.fill ins(%cst_64 : f32) outs(%51 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %53 = linalg.mmt4d ins(%pack_179, %cst_115 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_180 = tensor.unpack %53 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_180, %cst_67 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  %55 = tensor.empty() : tensor<1x3072x4x1xf32>
  %pack_181 = tensor.pack %54 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %56 = linalg.mmt4d ins(%pack_181, %cst_116 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_182 = tensor.unpack %56 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %57 = tensor.empty() : tensor<4x768xf32>
  %58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %unpack_182, %cst_68 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%57 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  %59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%58 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %60 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%59 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %61 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%58, %60 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %62 = tensor.empty() : tensor<4x768xf32>
  %63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %61, %cst_50, %cst_49, %60 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%62 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %pack_183 = tensor.pack %63 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %64 = linalg.mmt4d ins(%pack_183, %cst_117 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_184 = tensor.unpack %64 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %65 = tensor.empty() : tensor<4x2304xf32>
  %66 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_184, %cst_69 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%65 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Split the 4x2304 projection into three 4x768 slices (column offsets 0, 768
  // and 1536). A unit leading dim is added first and dropped again by the
  // rank-reducing extract_slice ops.
  %expanded_185 = tensor.expand_shape %66 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_186 = tensor.extract_slice %expanded_185[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_187 = tensor.extract_slice %expanded_185[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_188 = tensor.extract_slice %expanded_185[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  // View the first two slices as 4x12x64 (768 columns split into 12 groups of 64).
  %expanded_189 = tensor.expand_shape %extracted_slice_186 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_190 = tensor.expand_shape %extracted_slice_187 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy of the second slice: out[d0, d1, d2] = in[d1, d0, d2]
  // (4x12x64 -> 12x4x64). %67 has no user in this part of the dump.
  // NOTE(review): presumably it feeds one of the function results - verify.
  %67 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_190 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Tile both operands with the size-12 dim moved outermost. The second operand
  // uses a 16-wide tile on a dim of extent 4, hence the padding_value %cst_64.
  %pack_191 = tensor.pack %expanded_189 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_192 = tensor.pack %expanded_190 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  // Batched (batch = 12) tiled matmul of slice 1 against slice 2, then unpack
  // to 12x4x4; the unpack drops the columns that only existed as padding.
  %68 = linalg.batch_mmt4d ins(%pack_191, %pack_192 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_193 = tensor.unpack %68 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Element-wise adjustment of the 12x4x4 scores followed by a softmax.
  // %4 (1x4, defined above this chunk) is flattened to a 4-vector indexed by d2.
  %collapsed_194 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %69 = tensor.empty() : tensor<12x4x4xf32>
  // Per element:
  //   out = in * %cst_7 * %cst_0[d1, d2] - %cst[d1, d2] + %collapsed_194[d2] + %cst_6
  // %cst_7 is 0.125 and %cst_6 is ~1e-9 (see the constants at the top of the function).
  // NOTE(review): %cst_0 / %cst are elided 4x4 constants, presumably a mask and
  // its additive penalty; 0.125 equals 1/sqrt(64) - confirm against the model.
  %70 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_193, %cst_0, %cst, %collapsed_194 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%69 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax along the last dimension (dimension 2).
  %71 = linalg.softmax dimension(2) ins(%70 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Multiply the softmax output by the third 768-wide slice, merge the 12x64
  // groups back into 768 columns, and apply a 768 -> 768 tiled matmul.
  %expanded_195 = tensor.expand_shape %extracted_slice_188 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy of the third slice: out[d0, d1, d2] = in[d1, d0, d2]
  // (4x12x64 -> 12x4x64). %72 has no user in this part of the dump.
  // NOTE(review): presumably it feeds one of the function results - verify.
  %72 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_195 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Tile the 12x4x4 softmax output (LHS) and the third slice (RHS, size-12 dim
  // outermost, 16x1 tiles) for the batched matmul.
  %pack_196 = tensor.pack %71 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_197 = tensor.pack %expanded_195 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %73 = linalg.batch_mmt4d ins(%pack_196, %pack_197 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  // The unpack also permutes the outer dims, so the result is 4x12x64 again.
  %unpack_198 = tensor.unpack %73 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  // Re-tile as an mmt4d LHS and fold the 12x64 outer dims into a single 768 dim.
  %pack_199 = tensor.pack %unpack_198 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_200 = tensor.collapse_shape %pack_199 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  // Tiled matmul against the constant %cst_118, unpacked to 4x768.
  %74 = linalg.mmt4d ins(%collapsed_200, %cst_118 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_201 = tensor.unpack %74 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Bias + residual add: %76 = %58 + (%unpack_201 + %cst_70[d1]).
  // %cst_70 is indexed by d1 only, so it is broadcast across the 4 rows.
  // %58 is defined above this chunk.
  %75 = tensor.empty() : tensor<4x768xf32>
  %76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %unpack_201, %cst_70 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%75 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalization of %76 over its 768 columns, with a per-column
  // scale (%cst_48) and shift (%cst_47).
  //
  // Step 1: sum each row over d1 (reduction), accumulating into %9.
  // NOTE(review): assumes %9 (defined above this chunk) is zero-filled - confirm.
  %77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%76 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Step 2: mean = row sum / %cst_9 (%cst_9 is 768.0).
  %78 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Step 3: per-row sum of squared deviations from the mean (reduction over d1).
  %79 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%76, %78 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %80 = tensor.empty() : tensor<4x768xf32>
  // Step 4: normalize. Operands: %in = x, %in_486 = sum of squared deviations,
  // %in_487 = scale, %in_488 = shift, %in_489 = mean.
  %81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %79, %cst_48, %cst_47, %78 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%80 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // %382 = rsqrt(sum_sq / 768 + %cst_8), with %cst_8 ~ 1e-5.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // %383 = rsqrt * scale; %385 = shift - mean * %383.
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    // x * %383 + %385, i.e. (x - mean) * rsqrt(var + eps) * scale + shift.
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // 768 -> 3072 tiled matmul of the normalized activations %81 against the
  // constant %cst_119, unpacked to a plain 4x3072 tensor.
  %pack_202 = tensor.pack %81 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %82 = linalg.mmt4d ins(%pack_202, %cst_119 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_203 = tensor.unpack %82 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // Bias add (%cst_71 broadcast across rows) fused with the activation:
  //   x = in + bias
  //   y = %cst_4 * x * (%cst_11 + tanh(%cst_3 * (x + %cst_2 * x^%cst_5)))
  // With the constants declared at the top of the function (0.5, 1.0,
  // 0.797884583, 0.044715, exponent 3.0) this is the tanh-form GELU approximation.
  %83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_203, %cst_71 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // 3072 -> 768 tiled matmul of the activation output %83 against the constant
  // %cst_120, unpacked to 4x768.
  %pack_204 = tensor.pack %83 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %84 = linalg.mmt4d ins(%pack_204, %cst_120 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_205 = tensor.unpack %84 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %85 = tensor.empty() : tensor<4x768xf32>
  // Bias + residual add: %86 = %76 + (%unpack_205 + %cst_72[d1]).
  %86 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %unpack_205, %cst_72 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%85 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalization of %86 over its 768 columns, with per-column scale
  // %cst_46 and shift %cst_45.
  //
  // Row sums (reduction over d1) accumulated into %9.
  // NOTE(review): assumes %9 (defined above this chunk) is zero-filled - confirm.
  %87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%86 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row means: sum / %cst_9 (768.0).
  %88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%87 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Per-row sum of (x - mean)^2.
  %89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%86, %88 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %90 = tensor.empty() : tensor<4x768xf32>
  // Normalize: (x - mean) * rsqrt(sum_sq / 768 + %cst_8) * scale + shift,
  // evaluated as x * k + (shift - mean * k) with k = rsqrt(...) * scale.
  // Operands: %in = x, %in_486 = sum_sq, %in_487 = scale, %in_488 = shift, %in_489 = mean.
  %91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %89, %cst_46, %cst_45, %88 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%90 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // 768 -> 2304 tiled matmul of the normalized activations %91 against the
  // constant %cst_121, unpacked to 4x2304.
  // NOTE(review): the result is cut into three 768-wide slices below, so this
  // is presumably a fused Q/K/V projection - confirm.
  %pack_206 = tensor.pack %91 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %92 = linalg.mmt4d ins(%pack_206, %cst_121 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_207 = tensor.unpack %92 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %93 = tensor.empty() : tensor<4x2304xf32>
  // Bias add: %cst_73 (indexed by d1) is broadcast across the 4 rows.
  %94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_207, %cst_73 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%93 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Cut %94 into three 4x768 slices at column offsets 0 / 768 / 1536 (via a
  // temporary unit leading dim that the rank-reducing slices drop again).
  %expanded_208 = tensor.expand_shape %94 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_209 = tensor.extract_slice %expanded_208[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_210 = tensor.extract_slice %expanded_208[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_211 = tensor.extract_slice %expanded_208[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  // Reshape the first two slices to 4x12x64.
  %expanded_212 = tensor.expand_shape %extracted_slice_209 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_213 = tensor.expand_shape %extracted_slice_210 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Copy of the second slice with its first two dims swapped (-> 12x4x64).
  // %95 has no user in this part of the dump.
  // NOTE(review): presumably consumed by a function result - verify.
  %95 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_213 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Tile LHS (4x1 tiles) and RHS (16x1 tiles, padded with %cst_64 because the
  // tiled dim has extent 4), both with the size-12 dim outermost.
  %pack_214 = tensor.pack %expanded_212 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_215 = tensor.pack %expanded_213 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  // Batched tiled matmul (batch = 12) of slice 1 against slice 2 -> 12x4x4.
  %96 = linalg.batch_mmt4d ins(%pack_214, %pack_215 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_216 = tensor.unpack %96 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Score adjustment + softmax for the 12x4x4 tensor %unpack_216.
  // %4 (1x4) is flattened to a 4-vector that is indexed by d2 below.
  %collapsed_217 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %97 = tensor.empty() : tensor<12x4x4xf32>
  // out = in * %cst_7 * %cst_0[d1, d2] - %cst[d1, d2] + %collapsed_217[d2] + %cst_6
  // (%cst_7 = 0.125, %cst_6 ~ 1e-9).
  // NOTE(review): %cst_0 / %cst are elided 4x4 constants, presumably a mask
  // and its additive penalty - confirm.
  %98 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_216, %cst_0, %cst, %collapsed_217 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%97 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax over dimension 2 (the last one).
  %99 = linalg.softmax dimension(2) ins(%98 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Softmax output x third slice, merged back to 768 columns, then a
  // 768 -> 768 tiled matmul with the constant %cst_122.
  %expanded_218 = tensor.expand_shape %extracted_slice_211 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Copy of the third slice with its first two dims swapped (-> 12x4x64).
  // %100 has no user in this part of the dump.
  // NOTE(review): presumably consumed by a function result - verify.
  %100 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_218 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Tile the softmax output (LHS) and the third slice (RHS) for batch_mmt4d.
  %pack_219 = tensor.pack %99 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_220 = tensor.pack %expanded_218 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %101 = linalg.batch_mmt4d ins(%pack_219, %pack_220 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  // Unpack with an outer-dim permutation, giving 4x12x64.
  %unpack_221 = tensor.unpack %101 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  // Re-tile as an mmt4d LHS; the collapse folds the 12x64 outer dims into 768.
  %pack_222 = tensor.pack %unpack_221 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_223 = tensor.collapse_shape %pack_222 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  %102 = linalg.mmt4d ins(%collapsed_223, %cst_122 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_224 = tensor.unpack %102 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Bias + residual add: %104 = %86 + (%unpack_224 + %cst_74[d1]).
  // %cst_74 is indexed by d1 only and therefore broadcast across the 4 rows.
  %103 = tensor.empty() : tensor<4x768xf32>
  %104 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %unpack_224, %cst_74 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%103 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalization of %104 over 768 columns (scale %cst_44, shift %cst_43).
  //
  // Sum of each row, accumulated into %9.
  // NOTE(review): assumes %9 (defined above this chunk) is zero-filled - confirm.
  %105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%104 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Mean of each row: sum / %cst_9 (768.0).
  %106 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Sum over each row of the squared deviation from the row mean.
  %107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%104, %106 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %108 = tensor.empty() : tensor<4x768xf32>
  // Apply: k = rsqrt(sum_sq / 768 + %cst_8) * scale; out = x * k + (shift - mean * k).
  // Operands: %in = x, %in_486 = sum_sq, %in_487 = scale, %in_488 = shift, %in_489 = mean.
  %109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%104, %107, %cst_44, %cst_43, %106 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%108 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // 768 -> 3072 tiled matmul of %109 against the constant %cst_123, unpacked
  // to 4x3072.
  %pack_225 = tensor.pack %109 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %110 = linalg.mmt4d ins(%pack_225, %cst_123 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_226 = tensor.unpack %110 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // Bias add (%cst_75, broadcast across rows) fused with the activation
  //   y = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3))),  x = in + bias
  // (constants %cst_4, %cst_11, %cst_3, %cst_2, %cst_5): the tanh-form GELU
  // approximation.
  %111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_226, %cst_75 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // 3072 -> 768 tiled matmul of %111 against the constant %cst_124, unpacked
  // to 4x768.
  %pack_227 = tensor.pack %111 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %112 = linalg.mmt4d ins(%pack_227, %cst_124 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_228 = tensor.unpack %112 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %113 = tensor.empty() : tensor<4x768xf32>
  // Bias + residual add: %114 = %104 + (%unpack_228 + %cst_76[d1]).
  %114 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%104, %unpack_228, %cst_76 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%113 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalization of %114 over 768 columns (scale %cst_42, shift %cst_41).
  //
  // (1) Row sums, accumulated into %9.
  // NOTE(review): assumes %9 (defined above this chunk) is zero-filled - confirm.
  %115 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%114 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // (2) Row means: sum / %cst_9 (768.0).
  %116 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%115 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // (3) Row sums of (x - mean)^2.
  %117 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%114, %116 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %118 = tensor.empty() : tensor<4x768xf32>
  // (4) out = (x - mean) * rsqrt(sum_sq / 768 + %cst_8) * scale + shift, with
  // the mean folded into the additive term (%385 = shift - mean * %383).
  // Operands: %in = x, %in_486 = sum_sq, %in_487 = scale, %in_488 = shift, %in_489 = mean.
  %119 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %117, %cst_42, %cst_41, %116 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%118 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // 768 -> 2304 tiled matmul of %119 against the constant %cst_125, unpacked
  // to 4x2304.
  // NOTE(review): sliced three ways below, so presumably a fused Q/K/V
  // projection - confirm.
  %pack_229 = tensor.pack %119 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %120 = linalg.mmt4d ins(%pack_229, %cst_125 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_230 = tensor.unpack %120 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %121 = tensor.empty() : tensor<4x2304xf32>
  // Bias add: %cst_77 (indexed by d1) is broadcast across the 4 rows.
  %122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_230, %cst_77 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%121 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Three 4x768 slices of %122 (column offsets 0, 768, 1536); the unit leading
  // dim introduced by expand_shape is removed by the rank-reducing slices.
  %expanded_231 = tensor.expand_shape %122 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_232 = tensor.extract_slice %expanded_231[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_233 = tensor.extract_slice %expanded_231[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_234 = tensor.extract_slice %expanded_231[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  // First two slices viewed as 4x12x64.
  %expanded_235 = tensor.expand_shape %extracted_slice_232 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_236 = tensor.expand_shape %extracted_slice_233 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // 12x4x64 copy of the second slice (first two dims swapped).
  // %123 has no user in this part of the dump.
  // NOTE(review): presumably consumed by a function result - verify.
  %123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_236 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Tiled operands for the batched matmul; the RHS tile of 16 exceeds the dim
  // extent of 4, so it is padded with %cst_64.
  %pack_237 = tensor.pack %expanded_235 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_238 = tensor.pack %expanded_236 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  // Slice 1 x slice 2 per batch entry (batch = 12), unpacked to 12x4x4.
  %124 = linalg.batch_mmt4d ins(%pack_237, %pack_238 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_239 = tensor.unpack %124 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Score adjustment + softmax for %unpack_239 (12x4x4).
  // %4 (1x4) flattened to a 4-vector; it is indexed by d2 in the generic below.
  %collapsed_240 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %125 = tensor.empty() : tensor<12x4x4xf32>
  // out = in * %cst_7 * %cst_0[d1, d2] - %cst[d1, d2] + %collapsed_240[d2] + %cst_6
  // (%cst_7 = 0.125, %cst_6 ~ 1e-9).
  // NOTE(review): %cst_0 / %cst are elided 4x4 constants, presumably a mask
  // and its additive penalty - confirm.
  %126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_239, %cst_0, %cst, %collapsed_240 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%125 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax along dimension 2.
  %127 = linalg.softmax dimension(2) ins(%126 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Softmax output x third slice, regrouped to 768 columns, followed by a
  // 768 -> 768 tiled matmul with the constant %cst_126.
  %expanded_241 = tensor.expand_shape %extracted_slice_234 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // 12x4x64 copy of the third slice (first two dims swapped).
  // %128 has no user in this part of the dump.
  // NOTE(review): presumably consumed by a function result - verify.
  %128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_241 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // LHS = tiled softmax output, RHS = tiled third slice.
  %pack_242 = tensor.pack %127 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_243 = tensor.pack %expanded_241 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %129 = linalg.batch_mmt4d ins(%pack_242, %pack_243 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  // Unpack (with outer-dim permutation) to 4x12x64, re-tile as an mmt4d LHS,
  // and fold the 12x64 outer dims into one 768 dim.
  %unpack_244 = tensor.unpack %129 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  %pack_245 = tensor.pack %unpack_244 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_246 = tensor.collapse_shape %pack_245 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  %130 = linalg.mmt4d ins(%collapsed_246, %cst_126 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_247 = tensor.unpack %130 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Bias + residual add: %132 = %114 + (%unpack_247 + %cst_78[d1]).
  // %cst_78 is indexed by d1 only and therefore broadcast across the 4 rows.
  %131 = tensor.empty() : tensor<4x768xf32>
  %132 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %unpack_247, %cst_78 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%131 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalization of %132 over 768 columns (scale %cst_40, shift %cst_39).
  //
  // Row sums (reduction over d1) into %9.
  // NOTE(review): assumes %9 (defined above this chunk) is zero-filled - confirm.
  %133 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%132 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row means: divide by %cst_9 (768.0).
  %134 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%133 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row sums of squared deviations from the mean.
  %135 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%132, %134 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %136 = tensor.empty() : tensor<4x768xf32>
  // Normalize and apply scale/shift:
  //   k = rsqrt(sum_sq / 768 + %cst_8) * scale;  out = x * k + (shift - mean * k)
  // Operands: %in = x, %in_486 = sum_sq, %in_487 = scale, %in_488 = shift, %in_489 = mean.
  %137 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%132, %135, %cst_40, %cst_39, %134 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%136 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // 768 -> 3072 tiled matmul of %137 against the constant %cst_127, unpacked
  // to 4x3072.
  %pack_248 = tensor.pack %137 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %138 = linalg.mmt4d ins(%pack_248, %cst_127 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_249 = tensor.unpack %138 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // Bias add (%cst_79, broadcast across rows) followed in the same op by
  //   y = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
  // using %cst_4, %cst_11, %cst_3, %cst_2 and %cst_5: the tanh-form GELU
  // approximation.
  %139 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_249, %cst_79 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  %pack_250 = tensor.pack %139 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %140 = linalg.mmt4d ins(%pack_250, %cst_128 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_251 = tensor.unpack %140 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %141 = tensor.empty() : tensor<4x768xf32>
  %142 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%132, %unpack_251, %cst_80 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%141 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  %143 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%142 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %144 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%143 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %145 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%142, %144 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %146 = tensor.empty() : tensor<4x768xf32>
  %147 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%142, %145, %cst_38, %cst_37, %144 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%146 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %pack_252 = tensor.pack %147 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %148 = linalg.mmt4d ins(%pack_252, %cst_129 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_253 = tensor.unpack %148 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %149 = tensor.empty() : tensor<4x2304xf32>
  %150 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_253, %cst_81 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%149 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  %expanded_254 = tensor.expand_shape %150 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_255 = tensor.extract_slice %expanded_254[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_256 = tensor.extract_slice %expanded_254[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_257 = tensor.extract_slice %expanded_254[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %expanded_258 = tensor.expand_shape %extracted_slice_255 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_259 = tensor.expand_shape %extracted_slice_256 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %151 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_259 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  %pack_260 = tensor.pack %expanded_258 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_261 = tensor.pack %expanded_259 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  %152 = linalg.batch_mmt4d ins(%pack_260, %pack_261 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_262 = tensor.unpack %152 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  %collapsed_263 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %153 = tensor.empty() : tensor<12x4x4xf32>
  %154 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_262, %cst_0, %cst, %collapsed_263 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%153 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  %155 = linalg.softmax dimension(2) ins(%154 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  %expanded_264 = tensor.expand_shape %extracted_slice_257 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %156 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_264 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  %pack_265 = tensor.pack %155 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_266 = tensor.pack %expanded_264 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %157 = linalg.batch_mmt4d ins(%pack_265, %pack_266 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  %unpack_267 = tensor.unpack %157 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  %pack_268 = tensor.pack %unpack_267 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_269 = tensor.collapse_shape %pack_268 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  %158 = linalg.mmt4d ins(%collapsed_269, %cst_130 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_270 = tensor.unpack %158 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %159 = tensor.empty() : tensor<4x768xf32>
  %160 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%142, %unpack_270, %cst_82 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%159 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  %161 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%160 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %162 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%161 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %163 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%160, %162 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %164 = tensor.empty() : tensor<4x768xf32>
  %165 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%160, %163, %cst_36, %cst_35, %162 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%164 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %pack_271 = tensor.pack %165 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %166 = linalg.mmt4d ins(%pack_271, %cst_131 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_272 = tensor.unpack %166 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  %167 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_272, %cst_83 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  %pack_273 = tensor.pack %167 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %168 = linalg.mmt4d ins(%pack_273, %cst_132 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_274 = tensor.unpack %168 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %169 = tensor.empty() : tensor<4x768xf32>
  %170 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%160, %unpack_274, %cst_84 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%169 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  %171 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%170 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %172 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%171 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %173 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%170, %172 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %174 = tensor.empty() : tensor<4x768xf32>
  %175 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%170, %173, %cst_34, %cst_33, %172 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%174 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %pack_275 = tensor.pack %175 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %176 = linalg.mmt4d ins(%pack_275, %cst_133 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_276 = tensor.unpack %176 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %177 = tensor.empty() : tensor<4x2304xf32>
  %178 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_276, %cst_85 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%177 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  %expanded_277 = tensor.expand_shape %178 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_278 = tensor.extract_slice %expanded_277[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_279 = tensor.extract_slice %expanded_277[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_280 = tensor.extract_slice %expanded_277[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %expanded_281 = tensor.expand_shape %extracted_slice_278 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_282 = tensor.expand_shape %extracted_slice_279 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %179 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_282 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  %pack_283 = tensor.pack %expanded_281 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_284 = tensor.pack %expanded_282 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  %180 = linalg.batch_mmt4d ins(%pack_283, %pack_284 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_285 = tensor.unpack %180 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  %collapsed_286 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %181 = tensor.empty() : tensor<12x4x4xf32>
  %182 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_285, %cst_0, %cst, %collapsed_286 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%181 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  %183 = linalg.softmax dimension(2) ins(%182 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  %expanded_287 = tensor.expand_shape %extracted_slice_280 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %184 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_287 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  %pack_288 = tensor.pack %183 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_289 = tensor.pack %expanded_287 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %185 = linalg.batch_mmt4d ins(%pack_288, %pack_289 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  %unpack_290 = tensor.unpack %185 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  %pack_291 = tensor.pack %unpack_290 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_292 = tensor.collapse_shape %pack_291 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  %186 = linalg.mmt4d ins(%collapsed_292, %cst_134 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_293 = tensor.unpack %186 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %187 = tensor.empty() : tensor<4x768xf32>
  %188 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%170, %unpack_293, %cst_86 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%187 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  %189 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%188 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %190 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %191 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%188, %190 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %192 = tensor.empty() : tensor<4x768xf32>
  %193 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%188, %191, %cst_32, %cst_31, %190 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%192 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %pack_294 = tensor.pack %193 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %194 = linalg.mmt4d ins(%pack_294, %cst_135 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_295 = tensor.unpack %194 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  %195 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_295, %cst_87 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  %pack_296 = tensor.pack %195 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %196 = linalg.mmt4d ins(%pack_296, %cst_136 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_297 = tensor.unpack %196 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %197 = tensor.empty() : tensor<4x768xf32>
  %198 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%188, %unpack_297, %cst_88 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%197 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  %199 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%198 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %200 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%199 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  %201 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%198, %200 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %202 = tensor.empty() : tensor<4x768xf32>
  %203 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%198, %201, %cst_30, %cst_29, %200 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%202 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  %pack_298 = tensor.pack %203 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %204 = linalg.mmt4d ins(%pack_298, %cst_137 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_299 = tensor.unpack %204 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %205 = tensor.empty() : tensor<4x2304xf32>
  %206 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_299, %cst_89 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%205 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  %expanded_300 = tensor.expand_shape %206 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_301 = tensor.extract_slice %expanded_300[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_302 = tensor.extract_slice %expanded_300[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_303 = tensor.extract_slice %expanded_300[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %expanded_304 = tensor.expand_shape %extracted_slice_301 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_305 = tensor.expand_shape %extracted_slice_302 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %207 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_305 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  %pack_306 = tensor.pack %expanded_304 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_307 = tensor.pack %expanded_305 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  %208 = linalg.batch_mmt4d ins(%pack_306, %pack_307 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_308 = tensor.unpack %208 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  %collapsed_309 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %209 = tensor.empty() : tensor<12x4x4xf32>
  %210 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_308, %cst_0, %cst, %collapsed_309 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%209 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  %211 = linalg.softmax dimension(2) ins(%210 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Multiply the softmax result %211 with the third projected slice, then apply a 768x768 tiled matmul.
  // NOTE(review): presumably "probabilities x values" followed by the attention output projection — confirm.
  //
  // Split the 768 columns of %extracted_slice_303 into 12 groups of 64.
  %expanded_310 = tensor.expand_shape %extracted_slice_303 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // %212: copy of %expanded_310 with the first two dims swapped (4x12x64 -> 12x4x64).
  // It has no consumer in the visible part of this dump; presumably it is used further down — TODO confirm.
  %212 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_310 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // LHS operand: softmax output tiled 4x1 over dims 1 and 2.
  %pack_311 = tensor.pack %211 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  // RHS operand: %expanded_310 tiled 16x1 over dims 2 and 0, outer dims permuted to [1, 2, 0] (64 = 4 tiles of 16, no padding).
  %pack_312 = tensor.pack %expanded_310 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  // Batched tiled matmul accumulating into %36 (defined outside this excerpt; presumably zero-filled — confirm).
  %213 = linalg.batch_mmt4d ins(%pack_311, %pack_312 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  // Unpack into 4x12x64 (outer_dims_perm [1, 0, 2] puts the 4-sized dim first again) ...
  %unpack_313 = tensor.unpack %213 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  // ... and immediately re-pack with 4x1 tiles as the LHS of the next matmul.
  %pack_314 = tensor.pack %unpack_313 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  // Merge the 12 and 64 dims back into a single 768-sized dim.
  %collapsed_315 = tensor.collapse_shape %pack_314 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  // Tiled matmul against the pre-packed constant %cst_138 (48 tiles of 16 = 768 output columns).
  %214 = linalg.mmt4d ins(%collapsed_315, %cst_138 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  // Back to a plain 4x768 tensor.
  %unpack_316 = tensor.unpack %214 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Residual add with bias: %216 = %198 + (%unpack_316 + %cst_90[d1]).
  // %cst_90 is a 768-element vector broadcast along d0 (the 4 rows).
  %215 = tensor.empty() : tensor<4x768xf32>
  %216 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%198, %unpack_316, %cst_90 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%215 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + bias first, then add the residual input.
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %216 (4x768) with per-column scale %cst_28 and shift %cst_27.
  // NOTE(review): this has the shape of a LayerNorm with gamma = %cst_28, beta = %cst_27 — confirm.
  //
  // %217: per-row sum over the 768 columns, accumulated into %9
  // (%9 is defined outside this excerpt; presumably zero-filled — confirm).
  %217 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%216 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %218: per-row mean = sum / %cst_9, where %cst_9 = 768.0.
  %218 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%217 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %219: per-row sum of squared deviations from the mean, again accumulated into %9.
  %219 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%216, %218 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %220 = tensor.empty() : tensor<4x768xf32>
  // %221: apply the normalisation. Block arguments, in order:
  //   %in = %216 (x), %in_486 = %219 (squared-deviation sum), %in_487 = %cst_28, %in_488 = %cst_27, %in_489 = %218 (mean).
  %221 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%216, %219, %cst_28, %cst_27, %218 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%220 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = squared-deviation sum / 768; %cst_8 = 9.99999974E-6 is added before the rsqrt.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // scale = rsqrt(variance + eps) * %cst_28[d1]
    %383 = arith.mulf %382, %in_487 : f32
    // shift = %cst_27[d1] - mean * scale
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    // result = x * scale + shift, i.e. (x - mean) * scale + %cst_27[d1] in exact arithmetic.
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Two tiled matmuls (768 -> 3072 -> 768) with a biased tanh-based activation in between.
  // NOTE(review): presumably the transformer feed-forward (MLP) sub-block — confirm.
  //
  // Tile %221 (4x768) as the mmt4d LHS.
  %pack_317 = tensor.pack %221 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  // Matmul against the pre-packed constant %cst_139 (192 tiles of 16 = 3072 output columns).
  %222 = linalg.mmt4d ins(%pack_317, %cst_139 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_318 = tensor.unpack %222 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // With x = in + %cst_91[d1] (bias broadcast along rows), computes
  //   out = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
  // using %cst_5 = 3.0, %cst_2 = 4.4715e-02, %cst_3 = 0.797884583, %cst_11 = 1.0, %cst_4 = 0.5.
  // This matches the tanh approximation of GELU. The output init operand is %50, the same value
  // used as the unpack destination above.
  %223 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_318, %cst_91 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    // x = matmul result + bias
    %380 = arith.addf %in, %in_486 : f32
    // x + 0.044715 * x^3
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    // 1 + tanh(0.797884583 * (...))
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    // (0.5 * x) * (1 + tanh(...))
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // Tile the activation as the LHS of the second matmul.
  %pack_319 = tensor.pack %223 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  // Matmul against the pre-packed constant %cst_140 (48 tiles of 16 = 768 output columns).
  %224 = linalg.mmt4d ins(%pack_319, %cst_140 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_320 = tensor.unpack %224 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Residual add with bias: %226 = %216 + (%unpack_320 + %cst_92[d1]).
  // %cst_92 is a 768-element vector broadcast along d0 (the 4 rows).
  %225 = tensor.empty() : tensor<4x768xf32>
  %226 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%216, %unpack_320, %cst_92 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%225 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + bias first, then add the residual input.
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %226 (4x768) with per-column scale %cst_26 and shift %cst_25.
  // NOTE(review): this has the shape of a LayerNorm with gamma = %cst_26, beta = %cst_25 — confirm.
  //
  // %227: per-row sum over the 768 columns, accumulated into %9
  // (%9 is defined outside this excerpt; presumably zero-filled — confirm).
  %227 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%226 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %228: per-row mean = sum / %cst_9, where %cst_9 = 768.0.
  %228 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%227 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %229: per-row sum of squared deviations from the mean, again accumulated into %9.
  %229 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%226, %228 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %230 = tensor.empty() : tensor<4x768xf32>
  // %231: apply the normalisation. Block arguments, in order:
  //   %in = %226 (x), %in_486 = %229 (squared-deviation sum), %in_487 = %cst_26, %in_488 = %cst_25, %in_489 = %228 (mean).
  %231 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%226, %229, %cst_26, %cst_25, %228 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%230 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = squared-deviation sum / 768; %cst_8 = 9.99999974E-6 is added before the rsqrt.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // scale = rsqrt(variance + eps) * %cst_26[d1]
    %383 = arith.mulf %382, %in_487 : f32
    // shift = %cst_25[d1] - mean * scale
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    // result = x * scale + shift, i.e. (x - mean) * scale + %cst_25[d1] in exact arithmetic.
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Project %231 (4x768) to 4x2304, add a bias, and split the result into three 4x768 slices.
  // NOTE(review): presumably the fused query/key/value projection (offsets 0 / 768 / 1536) — confirm.
  //
  // Tile the input as the mmt4d LHS.
  %pack_321 = tensor.pack %231 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  // Matmul against the pre-packed constant %cst_141 (144 tiles of 16 = 2304 output columns).
  %232 = linalg.mmt4d ins(%pack_321, %cst_141 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_322 = tensor.unpack %232 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %233 = tensor.empty() : tensor<4x2304xf32>
  // Add the 2304-element bias %cst_93, broadcast along the 4 rows.
  %234 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_322, %cst_93 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%233 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Add a unit leading dim, then take three rank-reducing slices of 768 columns each
  // at column offsets 0, 768 and 1536 (the unit dim is dropped again by the slice result type).
  %expanded_323 = tensor.expand_shape %234 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_324 = tensor.extract_slice %expanded_323[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_325 = tensor.extract_slice %expanded_323[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_326 = tensor.extract_slice %expanded_323[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  // Split the 768 columns of the first two slices into 12 groups of 64.
  %expanded_327 = tensor.expand_shape %extracted_slice_324 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_328 = tensor.expand_shape %extracted_slice_325 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Batched score computation followed by softmax (12 batches of 4x4 scores).
  // NOTE(review): presumably the 12 batches are attention heads of width 64 — confirm.
  //
  // %235: copy of %expanded_328 (4x12x64) into a 12x4x64 tensor with the first two dims swapped.
  // It has no consumer in the visible part of this dump; presumably it is used further down — TODO confirm.
  %235 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_328 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // LHS operand: %expanded_327 tiled 4x1 over dims 0 and 2, with the 12-sized dim moved outermost.
  %pack_329 = tensor.pack %expanded_327 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  // RHS operand: %expanded_328 tiled 16x1; dim 0 has only 4 rows, so the 16-wide tile is filled up with %cst_64
  // (padding value is defined outside this excerpt).
  %pack_330 = tensor.pack %expanded_328 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  // Batched tiled matmul accumulating into %27 (defined outside this excerpt; presumably zero-filled — confirm).
  %236 = linalg.batch_mmt4d ins(%pack_329, %pack_330 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  // Back to an untiled 12x4x4 tensor; the 16-wide tile is cropped to the 4 real columns.
  %unpack_331 = tensor.unpack %236 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Drop the unit leading dim of %4 (1x4 -> 4) so it can be broadcast along d2 below.
  %collapsed_332 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %237 = tensor.empty() : tensor<12x4x4xf32>
  // Elementwise on the 12x4x4 scores:
  //   out = ((in * %cst_7) * %cst_0[d1, d2] - %cst[d1, d2]) + %collapsed_332[d2] + %cst_6
  // where %cst_7 = 0.125 and %cst_6 = 9.99999971E-10 (scalar constants at the top of the function).
  // NOTE(review): 0.125 equals 1/sqrt(64), presumably the 1/sqrt(head_dim) scale; the elided 4x4 constants
  // %cst_0 / %cst presumably encode a causal mask and %collapsed_332 an additive input mask — confirm.
  %238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_331, %cst_0, %cst, %collapsed_332 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%237 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax along the last dimension (dimension 2) of the adjusted scores.
  %239 = linalg.softmax dimension(2) ins(%238 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Multiply the softmax result %239 with the third projected slice, then apply a 768x768 tiled matmul.
  // NOTE(review): presumably "probabilities x values" followed by the attention output projection — confirm.
  //
  // Split the 768 columns of %extracted_slice_326 into 12 groups of 64.
  %expanded_333 = tensor.expand_shape %extracted_slice_326 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // %240: copy of %expanded_333 with the first two dims swapped (4x12x64 -> 12x4x64).
  // It has no consumer in the visible part of this dump; presumably it is used further down — TODO confirm.
  %240 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_333 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // LHS operand: softmax output tiled 4x1 over dims 1 and 2.
  %pack_334 = tensor.pack %239 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  // RHS operand: %expanded_333 tiled 16x1 over dims 2 and 0, outer dims permuted to [1, 2, 0] (64 = 4 tiles of 16, no padding).
  %pack_335 = tensor.pack %expanded_333 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  // Batched tiled matmul accumulating into %36 (defined outside this excerpt; presumably zero-filled — confirm).
  %241 = linalg.batch_mmt4d ins(%pack_334, %pack_335 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  // Unpack into 4x12x64 (outer_dims_perm [1, 0, 2] puts the 4-sized dim first again) ...
  %unpack_336 = tensor.unpack %241 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  // ... and immediately re-pack with 4x1 tiles as the LHS of the next matmul.
  %pack_337 = tensor.pack %unpack_336 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  // Merge the 12 and 64 dims back into a single 768-sized dim.
  %collapsed_338 = tensor.collapse_shape %pack_337 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  // Tiled matmul against the pre-packed constant %cst_142 (48 tiles of 16 = 768 output columns).
  %242 = linalg.mmt4d ins(%collapsed_338, %cst_142 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  // Back to a plain 4x768 tensor.
  %unpack_339 = tensor.unpack %242 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Residual add with bias: %244 = %226 + (%unpack_339 + %cst_94[d1]).
  // %cst_94 is a 768-element vector broadcast along d0 (the 4 rows).
  %243 = tensor.empty() : tensor<4x768xf32>
  %244 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%226, %unpack_339, %cst_94 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%243 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + bias first, then add the residual input.
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %244 (4x768) with per-column scale %cst_24 and shift %cst_23.
  // NOTE(review): this has the shape of a LayerNorm with gamma = %cst_24, beta = %cst_23 — confirm.
  //
  // %245: per-row sum over the 768 columns, accumulated into %9
  // (%9 is defined outside this excerpt; presumably zero-filled — confirm).
  %245 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%244 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %246: per-row mean = sum / %cst_9, where %cst_9 = 768.0.
  %246 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%245 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %247: per-row sum of squared deviations from the mean, again accumulated into %9.
  %247 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%244, %246 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %248 = tensor.empty() : tensor<4x768xf32>
  // %249: apply the normalisation. Block arguments, in order:
  //   %in = %244 (x), %in_486 = %247 (squared-deviation sum), %in_487 = %cst_24, %in_488 = %cst_23, %in_489 = %246 (mean).
  %249 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%244, %247, %cst_24, %cst_23, %246 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%248 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = squared-deviation sum / 768; %cst_8 = 9.99999974E-6 is added before the rsqrt.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // scale = rsqrt(variance + eps) * %cst_24[d1]
    %383 = arith.mulf %382, %in_487 : f32
    // shift = %cst_23[d1] - mean * scale
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    // result = x * scale + shift, i.e. (x - mean) * scale + %cst_23[d1] in exact arithmetic.
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Two tiled matmuls (768 -> 3072 -> 768) with a biased tanh-based activation in between.
  // NOTE(review): presumably the transformer feed-forward (MLP) sub-block — confirm.
  //
  // Tile %249 (4x768) as the mmt4d LHS.
  %pack_340 = tensor.pack %249 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  // Matmul against the pre-packed constant %cst_143 (192 tiles of 16 = 3072 output columns).
  %250 = linalg.mmt4d ins(%pack_340, %cst_143 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_341 = tensor.unpack %250 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // With x = in + %cst_95[d1] (bias broadcast along rows), computes
  //   out = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
  // using %cst_5 = 3.0, %cst_2 = 4.4715e-02, %cst_3 = 0.797884583, %cst_11 = 1.0, %cst_4 = 0.5.
  // This matches the tanh approximation of GELU. The output init operand is %50, the same value
  // used as the unpack destination above.
  %251 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_341, %cst_95 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    // x = matmul result + bias
    %380 = arith.addf %in, %in_486 : f32
    // x + 0.044715 * x^3
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    // 1 + tanh(0.797884583 * (...))
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    // (0.5 * x) * (1 + tanh(...))
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // Tile the activation as the LHS of the second matmul.
  %pack_342 = tensor.pack %251 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  // Matmul against the pre-packed constant %cst_144 (48 tiles of 16 = 768 output columns).
  %252 = linalg.mmt4d ins(%pack_342, %cst_144 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_343 = tensor.unpack %252 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Residual add with bias: %254 = %244 + (%unpack_343 + %cst_96[d1]).
  // %cst_96 is a 768-element vector broadcast along d0 (the 4 rows).
  %253 = tensor.empty() : tensor<4x768xf32>
  %254 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%244, %unpack_343, %cst_96 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%253 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + bias first, then add the residual input.
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %254 (4x768) with per-column scale %cst_22 and shift %cst_21.
  // NOTE(review): this has the shape of a LayerNorm with gamma = %cst_22, beta = %cst_21 — confirm.
  //
  // %255: per-row sum over the 768 columns, accumulated into %9
  // (%9 is defined outside this excerpt; presumably zero-filled — confirm).
  %255 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%254 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %256: per-row mean = sum / %cst_9, where %cst_9 = 768.0.
  %256 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%255 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %257: per-row sum of squared deviations from the mean, again accumulated into %9.
  %257 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%254, %256 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %258 = tensor.empty() : tensor<4x768xf32>
  // %259: apply the normalisation. Block arguments, in order:
  //   %in = %254 (x), %in_486 = %257 (squared-deviation sum), %in_487 = %cst_22, %in_488 = %cst_21, %in_489 = %256 (mean).
  %259 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%254, %257, %cst_22, %cst_21, %256 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%258 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = squared-deviation sum / 768; %cst_8 = 9.99999974E-6 is added before the rsqrt.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // scale = rsqrt(variance + eps) * %cst_22[d1]
    %383 = arith.mulf %382, %in_487 : f32
    // shift = %cst_21[d1] - mean * scale
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    // result = x * scale + shift, i.e. (x - mean) * scale + %cst_21[d1] in exact arithmetic.
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Project %259 (4x768) to 4x2304, add a bias, and split the result into three 4x768 slices.
  // NOTE(review): presumably the fused query/key/value projection (offsets 0 / 768 / 1536) — confirm.
  //
  // Tile the input as the mmt4d LHS.
  %pack_344 = tensor.pack %259 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  // Matmul against the pre-packed constant %cst_145 (144 tiles of 16 = 2304 output columns).
  %260 = linalg.mmt4d ins(%pack_344, %cst_145 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_345 = tensor.unpack %260 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %261 = tensor.empty() : tensor<4x2304xf32>
  // Add the 2304-element bias %cst_97, broadcast along the 4 rows.
  %262 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_345, %cst_97 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%261 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Add a unit leading dim, then take three rank-reducing slices of 768 columns each
  // at column offsets 0, 768 and 1536 (the unit dim is dropped again by the slice result type).
  %expanded_346 = tensor.expand_shape %262 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_347 = tensor.extract_slice %expanded_346[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_348 = tensor.extract_slice %expanded_346[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_349 = tensor.extract_slice %expanded_346[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  // Split the 768 columns of the first two slices into 12 groups of 64.
  %expanded_350 = tensor.expand_shape %extracted_slice_347 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_351 = tensor.expand_shape %extracted_slice_348 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Batched score computation followed by softmax (12 batches of 4x4 scores).
  // NOTE(review): presumably the 12 batches are attention heads of width 64 — confirm.
  //
  // %263: copy of %expanded_351 (4x12x64) into a 12x4x64 tensor with the first two dims swapped.
  // It has no consumer in the visible part of this dump; presumably it is used further down — TODO confirm.
  %263 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_351 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // LHS operand: %expanded_350 tiled 4x1 over dims 0 and 2, with the 12-sized dim moved outermost.
  %pack_352 = tensor.pack %expanded_350 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  // RHS operand: %expanded_351 tiled 16x1; dim 0 has only 4 rows, so the 16-wide tile is filled up with %cst_64
  // (padding value is defined outside this excerpt).
  %pack_353 = tensor.pack %expanded_351 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  // Batched tiled matmul accumulating into %27 (defined outside this excerpt; presumably zero-filled — confirm).
  %264 = linalg.batch_mmt4d ins(%pack_352, %pack_353 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  // Back to an untiled 12x4x4 tensor; the 16-wide tile is cropped to the 4 real columns.
  %unpack_354 = tensor.unpack %264 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Drop the unit leading dim of %4 (1x4 -> 4) so it can be broadcast along d2 below.
  %collapsed_355 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %265 = tensor.empty() : tensor<12x4x4xf32>
  // Elementwise on the 12x4x4 scores:
  //   out = ((in * %cst_7) * %cst_0[d1, d2] - %cst[d1, d2]) + %collapsed_355[d2] + %cst_6
  // where %cst_7 = 0.125 and %cst_6 = 9.99999971E-10 (scalar constants at the top of the function).
  // NOTE(review): 0.125 equals 1/sqrt(64), presumably the 1/sqrt(head_dim) scale; the elided 4x4 constants
  // %cst_0 / %cst presumably encode a causal mask and %collapsed_355 an additive input mask — confirm.
  %266 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_354, %cst_0, %cst, %collapsed_355 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%265 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax along the last dimension (dimension 2) of the adjusted scores.
  %267 = linalg.softmax dimension(2) ins(%266 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Multiply the softmax result %267 with the third projected slice, then apply a 768x768 tiled matmul.
  // NOTE(review): presumably "probabilities x values" followed by the attention output projection — confirm.
  //
  // Split the 768 columns of %extracted_slice_349 into 12 groups of 64.
  %expanded_356 = tensor.expand_shape %extracted_slice_349 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // %268: copy of %expanded_356 with the first two dims swapped (4x12x64 -> 12x4x64).
  // It has no consumer in the visible part of this dump; presumably it is used further down — TODO confirm.
  %268 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_356 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // LHS operand: softmax output tiled 4x1 over dims 1 and 2.
  %pack_357 = tensor.pack %267 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  // RHS operand: %expanded_356 tiled 16x1 over dims 2 and 0, outer dims permuted to [1, 2, 0] (64 = 4 tiles of 16, no padding).
  %pack_358 = tensor.pack %expanded_356 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  // Batched tiled matmul accumulating into %36 (defined outside this excerpt; presumably zero-filled — confirm).
  %269 = linalg.batch_mmt4d ins(%pack_357, %pack_358 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  // Unpack into 4x12x64 (outer_dims_perm [1, 0, 2] puts the 4-sized dim first again) ...
  %unpack_359 = tensor.unpack %269 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  // ... and immediately re-pack with 4x1 tiles as the LHS of the next matmul.
  %pack_360 = tensor.pack %unpack_359 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  // Merge the 12 and 64 dims back into a single 768-sized dim.
  %collapsed_361 = tensor.collapse_shape %pack_360 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  // Tiled matmul against the pre-packed constant %cst_146 (48 tiles of 16 = 768 output columns).
  %270 = linalg.mmt4d ins(%collapsed_361, %cst_146 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  // Back to a plain 4x768 tensor.
  %unpack_362 = tensor.unpack %270 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Residual add with bias: %272 = %254 + (%unpack_362 + %cst_98[d1]).
  // %cst_98 is a 768-element vector broadcast along d0 (the 4 rows).
  %271 = tensor.empty() : tensor<4x768xf32>
  %272 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%254, %unpack_362, %cst_98 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%271 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + bias first, then add the residual input.
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %272 (4x768) with per-column scale %cst_20 and shift %cst_19.
  // NOTE(review): this has the shape of a LayerNorm with gamma = %cst_20, beta = %cst_19 — confirm.
  //
  // %273: per-row sum over the 768 columns, accumulated into %9
  // (%9 is defined outside this excerpt; presumably zero-filled — confirm).
  %273 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%272 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %274: per-row mean = sum / %cst_9, where %cst_9 = 768.0.
  %274 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%273 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %275: per-row sum of squared deviations from the mean, again accumulated into %9.
  %275 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%272, %274 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %276 = tensor.empty() : tensor<4x768xf32>
  // %277: apply the normalisation. Block arguments, in order:
  //   %in = %272 (x), %in_486 = %275 (squared-deviation sum), %in_487 = %cst_20, %in_488 = %cst_19, %in_489 = %274 (mean).
  %277 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%272, %275, %cst_20, %cst_19, %274 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%276 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = squared-deviation sum / 768; %cst_8 = 9.99999974E-6 is added before the rsqrt.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // scale = rsqrt(variance + eps) * %cst_20[d1]
    %383 = arith.mulf %382, %in_487 : f32
    // shift = %cst_19[d1] - mean * scale
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    // result = x * scale + shift, i.e. (x - mean) * scale + %cst_19[d1] in exact arithmetic.
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Two tiled matmuls (768 -> 3072 -> 768) with a biased tanh-based activation in between.
  // NOTE(review): presumably the transformer feed-forward (MLP) sub-block — confirm.
  //
  // Tile %277 (4x768) as the mmt4d LHS.
  %pack_363 = tensor.pack %277 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  // Matmul against the pre-packed constant %cst_147 (192 tiles of 16 = 3072 output columns).
  %278 = linalg.mmt4d ins(%pack_363, %cst_147 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_364 = tensor.unpack %278 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // With x = in + %cst_99[d1] (bias broadcast along rows), computes
  //   out = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
  // using %cst_5 = 3.0, %cst_2 = 4.4715e-02, %cst_3 = 0.797884583, %cst_11 = 1.0, %cst_4 = 0.5.
  // This matches the tanh approximation of GELU. The output init operand is %50, the same value
  // used as the unpack destination above.
  %279 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_364, %cst_99 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    // x = matmul result + bias
    %380 = arith.addf %in, %in_486 : f32
    // x + 0.044715 * x^3
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    // 1 + tanh(0.797884583 * (...))
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    %386 = arith.addf %385, %cst_11 : f32
    // (0.5 * x) * (1 + tanh(...))
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // Tile the activation as the LHS of the second matmul.
  %pack_365 = tensor.pack %279 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  // Matmul against the pre-packed constant %cst_148 (48 tiles of 16 = 768 output columns).
  %280 = linalg.mmt4d ins(%pack_365, %cst_148 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_366 = tensor.unpack %280 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Residual add with bias: %282 = %272 + (%unpack_366 + %cst_100[d1]).
  // %cst_100 is a 768-element vector broadcast along d0 (the 4 rows).
  %281 = tensor.empty() : tensor<4x768xf32>
  %282 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%272, %unpack_366, %cst_100 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%281 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + bias first, then add the residual input.
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %282 (4x768) with per-column scale %cst_18 and shift %cst_17.
  // NOTE(review): this has the shape of a LayerNorm with gamma = %cst_18, beta = %cst_17 — confirm.
  //
  // %283: per-row sum over the 768 columns, accumulated into %9
  // (%9 is defined outside this excerpt; presumably zero-filled — confirm).
  %283 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%282 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %284: per-row mean = sum / %cst_9, where %cst_9 = 768.0.
  %284 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%283 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // %285: per-row sum of squared deviations from the mean, again accumulated into %9.
  %285 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%282, %284 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %286 = tensor.empty() : tensor<4x768xf32>
  // %287: apply the normalisation. Block arguments, in order:
  //   %in = %282 (x), %in_486 = %285 (squared-deviation sum), %in_487 = %cst_18, %in_488 = %cst_17, %in_489 = %284 (mean).
  %287 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%282, %285, %cst_18, %cst_17, %284 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%286 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = squared-deviation sum / 768; %cst_8 = 9.99999974E-6 is added before the rsqrt.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // scale = rsqrt(variance + eps) * %cst_18[d1]
    %383 = arith.mulf %382, %in_487 : f32
    // shift = %cst_17[d1] - mean * scale
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    // result = x * scale + shift, i.e. (x - mean) * scale + %cst_17[d1] in exact arithmetic.
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Packed matmul of the normalized activations %287 (4x768) against the constant
  // %cst_149, unpacked to 4x2304.
  // NOTE(review): %16, %18 and %15 are defined outside this view.
  %pack_367 = tensor.pack %287 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %288 = linalg.mmt4d ins(%pack_367, %cst_149 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_368 = tensor.unpack %288 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %289 = tensor.empty() : tensor<4x2304xf32>
  // Adds the 2304-element constant %cst_101, broadcast along d0.
  %290 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_368, %cst_101 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%289 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Splits the 2304 columns into three 4x768 slices at offsets 0, 768 and 1536
  // (presumably query / key / value — confirm). The first two are reshaped to 4x12x64 here.
  %expanded_369 = tensor.expand_shape %290 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_370 = tensor.extract_slice %expanded_369[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_371 = tensor.extract_slice %expanded_369[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_372 = tensor.extract_slice %expanded_369[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %expanded_373 = tensor.expand_shape %extracted_slice_370 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_374 = tensor.expand_shape %extracted_slice_371 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy of the second slice: 4x12x64 -> 12x4x64 (swaps the first two dims).
  // NOTE(review): %291 has no consumer within this view; presumably used later — confirm.
  %291 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_374 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Batched packed matmul (12 batches) of the first slice against the second slice,
  // contracting the 64-wide dimension; the result is unpacked to 12x4x4.
  // The second operand's size-4 dimension is tiled by 16, padded with %cst_64
  // (defined outside this view).
  %pack_375 = tensor.pack %expanded_373 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_376 = tensor.pack %expanded_374 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  %292 = linalg.batch_mmt4d ins(%pack_375, %pack_376 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_377 = tensor.unpack %292 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Flattens %4 (1x4, defined outside this view) to a 4-element vector.
  %collapsed_378 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %293 = tensor.empty() : tensor<12x4x4xf32>
  // Elementwise score adjustment:
  //   ((score * %cst_7) * %cst_0 - %cst) + %collapsed_378 + %cst_6
  // %cst_0 / %cst are 4x4 constants broadcast over d0; %collapsed_378 is broadcast over d0, d1.
  // NOTE(review): presumably scaling plus masking terms — confirm against the model.
  %294 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_377, %cst_0, %cst, %collapsed_378 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%293 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    // %cst_7 is 0.125.
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    // %cst_6 is 9.99999971E-10.
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax along the last dimension (dimension 2).
  %295 = linalg.softmax dimension(2) ins(%294 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Third 768-wide slice reshaped to 4x12x64.
  %expanded_379 = tensor.expand_shape %extracted_slice_372 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy 4x12x64 -> 12x4x64 (swaps the first two dims).
  // NOTE(review): %296 has no consumer within this view; presumably used later — confirm.
  %296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_379 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Batched packed matmul (12 batches) of the softmax output %295 with the third slice;
  // the unpack permutes the result back to 4x12x64.
  %pack_380 = tensor.pack %295 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_381 = tensor.pack %expanded_379 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %297 = linalg.batch_mmt4d ins(%pack_380, %pack_381 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  %unpack_382 = tensor.unpack %297 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  // Re-packs the 4x12x64 result and collapses the 12x64 outer dims into 768, producing
  // the 1x768x4x1 left operand for the next packed matmul.
  %pack_383 = tensor.pack %unpack_382 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_384 = tensor.collapse_shape %pack_383 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  // Packed matmul against the constant %cst_150, unpacked to 4x768.
  %298 = linalg.mmt4d ins(%collapsed_384, %cst_150 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_385 = tensor.unpack %298 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %299 = tensor.empty() : tensor<4x768xf32>
  // Elementwise: %300 = %282 + (%unpack_385 + %cst_102), with %cst_102 broadcast along d0.
  // Presumably a bias add followed by a residual add — confirm.
  %300 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%282, %unpack_385, %cst_102 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%299 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise sum of %300 over d1 (reduction), accumulated onto %9.
  // NOTE(review): %9 is defined outside this view; presumably zero-filled — confirm.
  %301 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%300 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row mean: each sum is divided by %cst_9 (768.0).
  %302 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%301 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row-wise sum of squared deviations of %300 from the row mean %302.
  %303 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%300, %302 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %304 = tensor.empty() : tensor<4x768xf32>
  // Normalization: with s = rsqrt(%303 / 768 + %cst_8) * %cst_16, yields
  // x * s + (%cst_15 - mean * s), i.e. (x - mean) * s + %cst_15.
  // %cst_16 / %cst_15 are per-column (d1) constants; presumably scale and shift — confirm.
  %305 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%300, %303, %cst_16, %cst_15, %302 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%304 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // %in_486 is the squared-deviation sum; dividing by 768 gives the variance.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    // %in_489 is the row mean; %385 folds the mean subtraction into the additive term.
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Packed matmul of %305 (4x768) against the constant %cst_151, unpacked to 4x3072.
  // NOTE(review): %16, %52 and %50 are defined outside this view.
  %pack_386 = tensor.pack %305 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %306 = linalg.mmt4d ins(%pack_386, %cst_151 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_387 = tensor.unpack %306 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // Adds %cst_103 (broadcast along d0), then applies
  //   0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
  // which matches the tanh approximation of GELU.
  %307 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_387, %cst_103 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    // x = matmul result + %cst_103
    %380 = arith.addf %in, %in_486 : f32
    // x^3 (%cst_5 is 3.0), scaled by %cst_2 (0.044715)
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    // %cst_3 is 0.797884583
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    // %cst_11 is 1.0, %cst_4 is 0.5
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // Packed matmul of %307 (4x3072) against the constant %cst_152, unpacked to 4x768.
  // NOTE(review): %55, %41 and %6 are defined outside this view.
  %pack_388 = tensor.pack %307 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %308 = linalg.mmt4d ins(%pack_388, %cst_152 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_389 = tensor.unpack %308 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %309 = tensor.empty() : tensor<4x768xf32>
  // Elementwise: %310 = %300 + (%unpack_389 + %cst_104), with %cst_104 broadcast along d0.
  // Presumably a bias add followed by a residual add — confirm.
  %310 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%300, %unpack_389, %cst_104 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%309 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise sum of %310 over d1 (reduction), accumulated onto %9.
  // NOTE(review): %9 is defined outside this view; presumably zero-filled — confirm.
  %311 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%310 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row mean: each sum is divided by %cst_9 (768.0).
  %312 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%311 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row-wise sum of squared deviations of %310 from the row mean %312.
  %313 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%310, %312 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %314 = tensor.empty() : tensor<4x768xf32>
  // Normalization: with s = rsqrt(%313 / 768 + %cst_8) * %cst_58, yields
  // x * s + (%cst_57 - mean * s), i.e. (x - mean) * s + %cst_57.
  // %cst_58 / %cst_57 are per-column (d1) constants; presumably scale and shift — confirm.
  %315 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%310, %313, %cst_58, %cst_57, %312 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%314 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // %in_486 is the squared-deviation sum; dividing by 768 gives the variance.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    // %in_489 is the row mean; %385 folds the mean subtraction into the additive term.
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Packed matmul of the normalized activations %315 (4x768) against the constant
  // %cst_153, unpacked to 4x2304.
  // NOTE(review): %16, %18 and %15 are defined outside this view.
  %pack_390 = tensor.pack %315 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %316 = linalg.mmt4d ins(%pack_390, %cst_153 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_391 = tensor.unpack %316 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %317 = tensor.empty() : tensor<4x2304xf32>
  // Adds the 2304-element constant %cst_105, broadcast along d0.
  %318 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_391, %cst_105 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%317 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Splits the 2304 columns into three 4x768 slices at offsets 0, 768 and 1536
  // (presumably query / key / value — confirm). The first two are reshaped to 4x12x64 here.
  %expanded_392 = tensor.expand_shape %318 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_393 = tensor.extract_slice %expanded_392[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_394 = tensor.extract_slice %expanded_392[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_395 = tensor.extract_slice %expanded_392[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %expanded_396 = tensor.expand_shape %extracted_slice_393 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_397 = tensor.expand_shape %extracted_slice_394 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy of the second slice: 4x12x64 -> 12x4x64 (swaps the first two dims).
  // NOTE(review): %319 has no consumer within this view; presumably used later — confirm.
  %319 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_397 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Batched packed matmul (12 batches) of the first slice against the second slice,
  // contracting the 64-wide dimension; the result is unpacked to 12x4x4.
  // The second operand's size-4 dimension is tiled by 16, padded with %cst_64
  // (defined outside this view).
  %pack_398 = tensor.pack %expanded_396 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_399 = tensor.pack %expanded_397 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  %320 = linalg.batch_mmt4d ins(%pack_398, %pack_399 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_400 = tensor.unpack %320 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Flattens %4 (1x4, defined outside this view) to a 4-element vector.
  %collapsed_401 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %321 = tensor.empty() : tensor<12x4x4xf32>
  // Elementwise score adjustment:
  //   ((score * %cst_7) * %cst_0 - %cst) + %collapsed_401 + %cst_6
  // %cst_0 / %cst are 4x4 constants broadcast over d0; %collapsed_401 is broadcast over d0, d1.
  // NOTE(review): presumably scaling plus masking terms — confirm against the model.
  %322 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_400, %cst_0, %cst, %collapsed_401 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%321 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    // %cst_7 is 0.125.
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    // %cst_6 is 9.99999971E-10.
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax along the last dimension (dimension 2).
  %323 = linalg.softmax dimension(2) ins(%322 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Third 768-wide slice reshaped to 4x12x64.
  %expanded_402 = tensor.expand_shape %extracted_slice_395 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy 4x12x64 -> 12x4x64 (swaps the first two dims).
  // NOTE(review): %324 has no consumer within this view; presumably used later — confirm.
  %324 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_402 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Batched packed matmul (12 batches) of the softmax output %323 with the third slice;
  // the unpack permutes the result back to 4x12x64.
  %pack_403 = tensor.pack %323 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_404 = tensor.pack %expanded_402 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %325 = linalg.batch_mmt4d ins(%pack_403, %pack_404 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  %unpack_405 = tensor.unpack %325 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  // Re-packs the 4x12x64 result and collapses the 12x64 outer dims into 768, producing
  // the 1x768x4x1 left operand for the next packed matmul.
  %pack_406 = tensor.pack %unpack_405 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_407 = tensor.collapse_shape %pack_406 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  // Packed matmul against the constant %cst_154, unpacked to 4x768.
  %326 = linalg.mmt4d ins(%collapsed_407, %cst_154 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_408 = tensor.unpack %326 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %327 = tensor.empty() : tensor<4x768xf32>
  // Elementwise: %328 = %310 + (%unpack_408 + %cst_106), with %cst_106 broadcast along d0.
  // Presumably a bias add followed by a residual add — confirm.
  %328 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%310, %unpack_408, %cst_106 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%327 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise sum of %328 over d1 (reduction), accumulated onto %9.
  // NOTE(review): %9 is defined outside this view; presumably zero-filled — confirm.
  %329 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%328 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row mean: each sum is divided by %cst_9 (768.0).
  %330 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%329 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row-wise sum of squared deviations of %328 from the row mean %330.
  %331 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%328, %330 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %332 = tensor.empty() : tensor<4x768xf32>
  // Normalization: with s = rsqrt(%331 / 768 + %cst_8) * %cst_56, yields
  // x * s + (%cst_55 - mean * s), i.e. (x - mean) * s + %cst_55.
  // %cst_56 / %cst_55 are per-column (d1) constants; presumably scale and shift — confirm.
  %333 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%328, %331, %cst_56, %cst_55, %330 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%332 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // %in_486 is the squared-deviation sum; dividing by 768 gives the variance.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    // %in_489 is the row mean; %385 folds the mean subtraction into the additive term.
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Packed matmul of %333 (4x768) against the constant %cst_155, unpacked to 4x3072.
  // NOTE(review): %16, %52 and %50 are defined outside this view.
  %pack_409 = tensor.pack %333 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %334 = linalg.mmt4d ins(%pack_409, %cst_155 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_410 = tensor.unpack %334 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // Adds %cst_107 (broadcast along d0), then applies
  //   0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
  // which matches the tanh approximation of GELU.
  %335 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_410, %cst_107 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    // x = matmul result + %cst_107
    %380 = arith.addf %in, %in_486 : f32
    // x^3 (%cst_5 is 3.0), scaled by %cst_2 (0.044715)
    %381 = math.powf %380, %cst_5 : f32
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    // %cst_3 is 0.797884583
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    // %cst_11 is 1.0, %cst_4 is 0.5
    %386 = arith.addf %385, %cst_11 : f32
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // Packed matmul of %335 (4x3072) against the constant %cst_156, unpacked to 4x768.
  // NOTE(review): %55, %41 and %6 are defined outside this view.
  %pack_411 = tensor.pack %335 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %336 = linalg.mmt4d ins(%pack_411, %cst_156 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_412 = tensor.unpack %336 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  %337 = tensor.empty() : tensor<4x768xf32>
  // Elementwise: %338 = %328 + (%unpack_412 + %cst_108), with %cst_108 broadcast along d0.
  // Presumably a bias add followed by a residual add — confirm.
  %338 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%328, %unpack_412, %cst_108 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%337 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    %380 = arith.addf %in_486, %in_487 : f32
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise sum of %338 over d1 (reduction), accumulated onto %9.
  // NOTE(review): %9 is defined outside this view; presumably zero-filled — confirm.
  %339 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%338 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row mean: each sum is divided by %cst_9 (768.0).
  %340 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%339 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Row-wise sum of squared deviations of %338 from the row mean %340.
  %341 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%338, %340 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  %342 = tensor.empty() : tensor<4x768xf32>
  // Normalization: with s = rsqrt(%341 / 768 + %cst_8) * %cst_54, yields
  // x * s + (%cst_53 - mean * s), i.e. (x - mean) * s + %cst_53.
  // %cst_54 / %cst_53 are per-column (d1) constants; presumably scale and shift — confirm.
  %343 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%338, %341, %cst_54, %cst_53, %340 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%342 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // %in_486 is the squared-deviation sum; dividing by 768 gives the variance.
    %380 = arith.divf %in_486, %cst_9 : f32
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    %383 = arith.mulf %382, %in_487 : f32
    // %in_489 is the row mean; %385 folds the mean subtraction into the additive term.
    %384 = arith.mulf %in_489, %383 : f32
    %385 = arith.subf %in_488, %384 : f32
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Packed matmul of the normalized activations %343 (4x768) against the constant
  // %cst_157, unpacked to 4x2304.
  // NOTE(review): %16, %18 and %15 are defined outside this view.
  %pack_413 = tensor.pack %343 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %344 = linalg.mmt4d ins(%pack_413, %cst_157 : tensor<1x768x4x1xf32>, tensor<144x768x16x1xf32>) outs(%18 : tensor<1x144x4x16xf32>) -> tensor<1x144x4x16xf32>
  %unpack_414 = tensor.unpack %344 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %15 : tensor<1x144x4x16xf32> -> tensor<4x2304xf32>
  %345 = tensor.empty() : tensor<4x2304xf32>
  // Adds the 2304-element constant %cst_109, broadcast along d0.
  %346 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_414, %cst_109 : tensor<4x2304xf32>, tensor<2304xf32>) outs(%345 : tensor<4x2304xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.addf %in, %in_486 : f32
    linalg.yield %380 : f32
  } -> tensor<4x2304xf32>
  // Splits the 2304 columns into three 4x768 slices at offsets 0, 768 and 1536
  // (presumably query / key / value — confirm). The first two are reshaped to 4x12x64 here.
  %expanded_415 = tensor.expand_shape %346 [[0, 1], [2]] output_shape [1, 4, 2304] : tensor<4x2304xf32> into tensor<1x4x2304xf32>
  %extracted_slice_416 = tensor.extract_slice %expanded_415[0, 0, 0] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_417 = tensor.extract_slice %expanded_415[0, 0, 768] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %extracted_slice_418 = tensor.extract_slice %expanded_415[0, 0, 1536] [1, 4, 768] [1, 1, 1] : tensor<1x4x2304xf32> to tensor<4x768xf32>
  %expanded_419 = tensor.expand_shape %extracted_slice_416 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  %expanded_420 = tensor.expand_shape %extracted_slice_417 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy of the second slice: 4x12x64 -> 12x4x64 (swaps the first two dims).
  // NOTE(review): %347 has no consumer within this view; presumably used later — confirm.
  %347 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_420 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Batched packed matmul (12 batches) of the first slice against the second slice,
  // contracting the 64-wide dimension; the result is unpacked to 12x4x4.
  // The second operand's size-4 dimension is tiled by 16, padded with %cst_64
  // (defined outside this view).
  %pack_421 = tensor.pack %expanded_419 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %24 : tensor<4x12x64xf32> -> tensor<12x1x64x4x1xf32>
  %pack_422 = tensor.pack %expanded_420 padding_value(%cst_64 : f32) outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [16, 1] into %25 : tensor<4x12x64xf32> -> tensor<12x1x64x16x1xf32>
  %348 = linalg.batch_mmt4d ins(%pack_421, %pack_422 : tensor<12x1x64x4x1xf32>, tensor<12x1x64x16x1xf32>) outs(%27 : tensor<12x1x1x4x16xf32>) -> tensor<12x1x1x4x16xf32>
  %unpack_423 = tensor.unpack %348 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 16] into %5 : tensor<12x1x1x4x16xf32> -> tensor<12x4x4xf32>
  // Flattens %4 (1x4, defined outside this view) to a 4-element vector.
  %collapsed_424 = tensor.collapse_shape %4 [[0, 1]] : tensor<1x4xf32> into tensor<4xf32>
  %349 = tensor.empty() : tensor<12x4x4xf32>
  // Elementwise score adjustment:
  //   ((score * %cst_7) * %cst_0 - %cst) + %collapsed_424 + %cst_6
  // %cst_0 / %cst are 4x4 constants broadcast over d0; %collapsed_424 is broadcast over d0, d1.
  // NOTE(review): presumably scaling plus masking terms — confirm against the model.
  %350 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack_423, %cst_0, %cst, %collapsed_424 : tensor<12x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>) outs(%349 : tensor<12x4x4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %out: f32):
    // %cst_7 is 0.125.
    %380 = arith.mulf %in, %cst_7 : f32
    %381 = arith.mulf %380, %in_486 : f32
    %382 = arith.subf %381, %in_487 : f32
    %383 = arith.addf %382, %in_488 : f32
    // %cst_6 is 9.99999971E-10.
    %384 = arith.addf %383, %cst_6 : f32
    linalg.yield %384 : f32
  } -> tensor<12x4x4xf32>
  // Softmax along the last dimension (dimension 2).
  %351 = linalg.softmax dimension(2) ins(%350 : tensor<12x4x4xf32>) outs(%5 : tensor<12x4x4xf32>) -> tensor<12x4x4xf32>
  // Third 768-wide slice reshaped to 4x12x64.
  %expanded_425 = tensor.expand_shape %extracted_slice_418 [[0], [1, 2]] output_shape [4, 12, 64] : tensor<4x768xf32> into tensor<4x12x64xf32>
  // Transposed copy 4x12x64 -> 12x4x64 (swaps the first two dims).
  // NOTE(review): %352 has no consumer within this view; presumably used later — confirm.
  %352 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_425 : tensor<4x12x64xf32>) outs(%22 : tensor<12x4x64xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<12x4x64xf32>
  // Batched packed matmul (12 batches) of the softmax output %351 with the third slice;
  // the unpack permutes the result back to 4x12x64.
  %pack_426 = tensor.pack %351 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 1] into %33 : tensor<12x4x4xf32> -> tensor<12x1x4x4x1xf32>
  %pack_427 = tensor.pack %expanded_425 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %34 : tensor<4x12x64xf32> -> tensor<12x4x4x16x1xf32>
  %353 = linalg.batch_mmt4d ins(%pack_426, %pack_427 : tensor<12x1x4x4x1xf32>, tensor<12x4x4x16x1xf32>) outs(%36 : tensor<12x1x4x4x16xf32>) -> tensor<12x1x4x4x16xf32>
  %unpack_428 = tensor.unpack %353 outer_dims_perm = [1, 0, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 16] into %38 : tensor<12x1x4x4x16xf32> -> tensor<4x12x64xf32>
  %pack_429 = tensor.pack %unpack_428 outer_dims_perm = [0, 1, 2] inner_dims_pos = [0, 2] inner_tiles = [4, 1] into %39 : tensor<4x12x64xf32> -> tensor<1x12x64x4x1xf32>
  %collapsed_430 = tensor.collapse_shape %pack_429 [[0], [1, 2], [3], [4]] : tensor<1x12x64x4x1xf32> into tensor<1x768x4x1xf32>
  %354 = linalg.mmt4d ins(%collapsed_430, %cst_158 : tensor<1x768x4x1xf32>, tensor<48x768x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_431 = tensor.unpack %354 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Elementwise %356 = %338 + (%unpack_431 + %cst_110).
  // %cst_110 is indexed by d1 only, so the 768-vector is broadcast across the 4 rows.
  // NOTE(review): presumably bias add followed by a residual connection - confirm.
  %355 = tensor.empty() : tensor<4x768xf32>
  %356 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%338, %unpack_431, %cst_110 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%355 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + broadcast vector
    %380 = arith.addf %in_486, %in_487 : f32
    // add the %338 operand
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %356 over the 768-wide dim, with per-column scale and offset.
  // %cst_9 is 768.0 and %cst_8 is ~1e-5 (both defined at the top of the function).
  // %9 and %2 are defined outside this chunk; %9 is presumably a zero-filled accumulator - TODO confirm.

  // Step 1: per-row sum over d1 (reduction), accumulated onto %9.
  %357 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%356 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Step 2: per-row mean = sum / 768. The body never reads %out, so the contents of %2 do not matter.
  %358 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%357 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Step 3: per-row sum of squared deviations from the mean, accumulated onto %9.
  %359 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%356, %358 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  // Step 4: apply the normalisation elementwise.
  // Block args: %in = x (%356), %in_486 = squared-deviation sum (%359), %in_487 = %cst_52,
  //             %in_488 = %cst_51, %in_489 = mean (%358).
  // With s = rsqrt(sumsq / 768 + %cst_8) * %cst_52, the result is x * s + (%cst_51 - mean * s),
  // i.e. (x - mean) * rsqrt(var + eps) * %cst_52 + %cst_51.
  %360 = tensor.empty() : tensor<4x768xf32>
  %361 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%356, %359, %cst_52, %cst_51, %358 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%360 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = sumsq / 768
    %380 = arith.divf %in_486, %cst_9 : f32
    // + epsilon
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // s = rsqrt(var + eps) * scale
    %383 = arith.mulf %382, %in_487 : f32
    // mean * s
    %384 = arith.mulf %in_489, %383 : f32
    // offset - mean * s
    %385 = arith.subf %in_488, %384 : f32
    // x * s
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Tiled matmul 4x768 -> 4x3072: pack the normalised rows, multiply by the pre-packed
  // constant %cst_159 (192x768x16x1) and unpack. 192 outer tiles * 16 = 3072 columns.
  // %16, %52 and %50 are defined outside this chunk.
  // NOTE(review): looks like the first (expanding) MLP projection - confirm.
  %pack_432 = tensor.pack %361 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %362 = linalg.mmt4d ins(%pack_432, %cst_159 : tensor<1x768x4x1xf32>, tensor<192x768x16x1xf32>) outs(%52 : tensor<1x192x4x16xf32>) -> tensor<1x192x4x16xf32>
  %unpack_433 = tensor.unpack %362 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %50 : tensor<1x192x4x16xf32> -> tensor<4x3072xf32>
  // Add %cst_111 (broadcast across rows), then apply
  //   y = 0.5 * x * (1 + tanh(0.797884583 * (x + 0.044715 * x^3)))
  // which is the tanh approximation of GELU. The constants are %cst_4, %cst_11, %cst_3,
  // %cst_2 and %cst_5 from the top of the function.
  // %50 is reused as the init operand; the body never reads %out, so its contents are irrelevant.
  %363 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack_433, %cst_111 : tensor<4x3072xf32>, tensor<3072xf32>) outs(%50 : tensor<4x3072xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    // x = matmul result + broadcast vector
    %380 = arith.addf %in, %in_486 : f32
    // x^3
    %381 = math.powf %380, %cst_5 : f32
    // 0.044715 * x^3
    %382 = arith.mulf %381, %cst_2 : f32
    %383 = arith.addf %380, %382 : f32
    // scale by 0.797884583
    %384 = arith.mulf %383, %cst_3 : f32
    %385 = math.tanh %384 : f32
    // 1 + tanh(...)
    %386 = arith.addf %385, %cst_11 : f32
    // 0.5 * x
    %387 = arith.mulf %380, %cst_4 : f32
    %388 = arith.mulf %387, %386 : f32
    linalg.yield %388 : f32
  } -> tensor<4x3072xf32>
  // Tiled matmul 4x3072 -> 4x768 against the pre-packed constant %cst_160 (48x3072x16x1).
  // %55, %41 and %6 are defined outside this chunk.
  %pack_434 = tensor.pack %363 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %55 : tensor<4x3072xf32> -> tensor<1x3072x4x1xf32>
  %364 = linalg.mmt4d ins(%pack_434, %cst_160 : tensor<1x3072x4x1xf32>, tensor<48x3072x16x1xf32>) outs(%41 : tensor<1x48x4x16xf32>) -> tensor<1x48x4x16xf32>
  %unpack_435 = tensor.unpack %364 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %6 : tensor<1x48x4x16xf32> -> tensor<4x768xf32>
  // Elementwise %366 = %356 + (%unpack_435 + %cst_112).
  // %cst_112 is indexed by d1 only, so the 768-vector is broadcast across the 4 rows.
  // NOTE(review): presumably bias add followed by a residual connection - confirm.
  %365 = tensor.empty() : tensor<4x768xf32>
  %366 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%356, %unpack_435, %cst_112 : tensor<4x768xf32>, tensor<4x768xf32>, tensor<768xf32>) outs(%365 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %out: f32):
    // matmul result + broadcast vector
    %380 = arith.addf %in_486, %in_487 : f32
    // add the %356 operand
    %381 = arith.addf %in, %380 : f32
    linalg.yield %381 : f32
  } -> tensor<4x768xf32>
  // Row-wise normalisation of %366 over the 768-wide dim, with per-column scale %cst_14 and
  // offset %cst_13. %cst_9 is 768.0 and %cst_8 is ~1e-5 (both defined at the top of the function).
  // %9 and %2 are defined outside this chunk; %9 is presumably a zero-filled accumulator - TODO confirm.

  // Step 1: per-row sum over d1 (reduction), accumulated onto %9.
  %367 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%366 : tensor<4x768xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.addf %out, %in : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Step 2: per-row mean = sum / 768. The body never reads %out.
  %368 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%367 : tensor<4xf32>) outs(%2 : tensor<4xf32>) {
  ^bb0(%in: f32, %out: f32):
    %380 = arith.divf %in, %cst_9 : f32
    linalg.yield %380 : f32
  } -> tensor<4xf32>
  // Step 3: per-row sum of squared deviations from the mean, accumulated onto %9.
  %369 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%366, %368 : tensor<4x768xf32>, tensor<4xf32>) outs(%9 : tensor<4xf32>) {
  ^bb0(%in: f32, %in_486: f32, %out: f32):
    %380 = arith.subf %in, %in_486 : f32
    %381 = arith.mulf %380, %380 : f32
    %382 = arith.addf %out, %381 : f32
    linalg.yield %382 : f32
  } -> tensor<4xf32>
  // Step 4: apply the normalisation elementwise.
  // Block args: %in = x (%366), %in_486 = squared-deviation sum (%369), %in_487 = %cst_14,
  //             %in_488 = %cst_13, %in_489 = mean (%368).
  // With s = rsqrt(sumsq / 768 + %cst_8) * %cst_14, the result is x * s + (%cst_13 - mean * s),
  // i.e. (x - mean) * rsqrt(var + eps) * %cst_14 + %cst_13.
  %370 = tensor.empty() : tensor<4x768xf32>
  %371 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%366, %369, %cst_14, %cst_13, %368 : tensor<4x768xf32>, tensor<4xf32>, tensor<768xf32>, tensor<768xf32>, tensor<4xf32>) outs(%370 : tensor<4x768xf32>) {
  ^bb0(%in: f32, %in_486: f32, %in_487: f32, %in_488: f32, %in_489: f32, %out: f32):
    // variance = sumsq / 768
    %380 = arith.divf %in_486, %cst_9 : f32
    // + epsilon
    %381 = arith.addf %380, %cst_8 : f32
    %382 = math.rsqrt %381 : f32
    // s = rsqrt(var + eps) * scale
    %383 = arith.mulf %382, %in_487 : f32
    // mean * s
    %384 = arith.mulf %in_489, %383 : f32
    // offset - mean * s
    %385 = arith.subf %in_488, %384 : f32
    // x * s
    %386 = arith.mulf %in, %383 : f32
    %387 = arith.addf %386, %385 : f32
    linalg.yield %387 : f32
  } -> tensor<4x768xf32>
  // Final tiled matmul 4x768 -> 4x50257 against the pre-packed constant %cst_161 (3142x768x16x1).
  // Unlike the earlier mmt4d ops in this chunk, the accumulator is created here via empty + fill.
  // %cst_64 is defined outside this chunk - presumably 0.0; TODO confirm.
  // NOTE(review): 50257 columns matches output0 in the ABI declaration; looks like the logits projection.
  %pack_436 = tensor.pack %371 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 1] into %16 : tensor<4x768xf32> -> tensor<1x768x4x1xf32>
  %372 = tensor.empty() : tensor<1x3142x4x16xf32>
  %373 = linalg.fill ins(%cst_64 : f32) outs(%372 : tensor<1x3142x4x16xf32>) -> tensor<1x3142x4x16xf32>
  %374 = linalg.mmt4d ins(%pack_436, %cst_161 : tensor<1x768x4x1xf32>, tensor<3142x768x16x1xf32>) outs(%373 : tensor<1x3142x4x16xf32>) -> tensor<1x3142x4x16xf32>
  // 3142 tiles * 16 = 50272 packed columns; the unpack destination keeps only 50257 of them,
  // so the last 15 columns of the final tile are padding and are dropped.
  %375 = tensor.empty() : tensor<4x50257xf32>
  %unpack_437 = tensor.unpack %374 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %375 : tensor<1x3142x4x16xf32> -> tensor<4x50257xf32>
  // Prepend a unit dim: 4x50257 -> 1x4x50257 (the exported "output0" shape).
  %expanded_438 = tensor.expand_shape %unpack_437 [[0, 1], [2]] output_shape [1, 4, 50257] : tensor<4x50257xf32> into tensor<1x4x50257xf32>
  // Build twelve independent 2x1x12x4x64 tensors, each holding a pair of 12x4x64 tensors.
  // For every pair, the first insert_slice writes index 0 of the leading dim of the shared
  // empty tensor %376, and the second writes index 1 of the first insert's result.
  // Each pair starts from %376 itself, so the twelve results do not depend on each other.
  // The inserts are rank-reducing: the 12x4x64 source fills a [1, 1, 12, 4, 64] slice.
  // Pairs are emitted from the highest-numbered SSA values (%347/%352) down to the lowest (%23/%32).
  // NOTE(review): presumably the per-layer key/value tensors of a 12-layer model - confirm.
  %376 = tensor.empty() : tensor<2x1x12x4x64xf32>
  %inserted_slice = tensor.insert_slice %347 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_439 = tensor.insert_slice %352 into %inserted_slice[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_440 = tensor.insert_slice %319 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_441 = tensor.insert_slice %324 into %inserted_slice_440[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_442 = tensor.insert_slice %291 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_443 = tensor.insert_slice %296 into %inserted_slice_442[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_444 = tensor.insert_slice %263 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_445 = tensor.insert_slice %268 into %inserted_slice_444[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_446 = tensor.insert_slice %235 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_447 = tensor.insert_slice %240 into %inserted_slice_446[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_448 = tensor.insert_slice %207 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_449 = tensor.insert_slice %212 into %inserted_slice_448[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_450 = tensor.insert_slice %179 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_451 = tensor.insert_slice %184 into %inserted_slice_450[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_452 = tensor.insert_slice %151 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_453 = tensor.insert_slice %156 into %inserted_slice_452[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_454 = tensor.insert_slice %123 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_455 = tensor.insert_slice %128 into %inserted_slice_454[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_456 = tensor.insert_slice %95 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_457 = tensor.insert_slice %100 into %inserted_slice_456[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_458 = tensor.insert_slice %67 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_459 = tensor.insert_slice %72 into %inserted_slice_458[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_460 = tensor.insert_slice %23 into %376[0, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  %inserted_slice_461 = tensor.insert_slice %32 into %inserted_slice_460[1, 0, 0, 0, 0] [1, 1, 12, 4, 64] [1, 1, 1, 1, 1] : tensor<12x4x64xf32> into tensor<2x1x12x4x64xf32>
  // Stack the twelve pair tensors along a new leading dim to form the 12x2x1x12x4x64 result.
  // Each step collapses the unit dim of one pair tensor (2x1x12x4x64 -> 2x12x4x64) and inserts
  // it at the next index of dim 0 of the running result, which starts as the empty tensor %377.
  // The inserts are rank-reducing: the 2x12x4x64 source fills a [1, 2, 1, 12, 4, 64] slice.
  // Indices 0..11 take %inserted_slice_461, _459, ..., _439, i.e. the reverse of the order in
  // which the pair tensors were created, so the %23/%32 pair ends up at index 0 and the
  // %347/%352 pair at index 11.
  %377 = tensor.empty() : tensor<12x2x1x12x4x64xf32>
  %collapsed_462 = tensor.collapse_shape %inserted_slice_461 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_463 = tensor.insert_slice %collapsed_462 into %377[0, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_464 = tensor.collapse_shape %inserted_slice_459 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_465 = tensor.insert_slice %collapsed_464 into %inserted_slice_463[1, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_466 = tensor.collapse_shape %inserted_slice_457 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_467 = tensor.insert_slice %collapsed_466 into %inserted_slice_465[2, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_468 = tensor.collapse_shape %inserted_slice_455 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_469 = tensor.insert_slice %collapsed_468 into %inserted_slice_467[3, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_470 = tensor.collapse_shape %inserted_slice_453 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_471 = tensor.insert_slice %collapsed_470 into %inserted_slice_469[4, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_472 = tensor.collapse_shape %inserted_slice_451 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_473 = tensor.insert_slice %collapsed_472 into %inserted_slice_471[5, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_474 = tensor.collapse_shape %inserted_slice_449 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_475 = tensor.insert_slice %collapsed_474 into %inserted_slice_473[6, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_476 = tensor.collapse_shape %inserted_slice_447 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_477 = tensor.insert_slice %collapsed_476 into %inserted_slice_475[7, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_478 = tensor.collapse_shape %inserted_slice_445 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_479 = tensor.insert_slice %collapsed_478 into %inserted_slice_477[8, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_480 = tensor.collapse_shape %inserted_slice_443 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_481 = tensor.insert_slice %collapsed_480 into %inserted_slice_479[9, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_482 = tensor.collapse_shape %inserted_slice_441 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_483 = tensor.insert_slice %collapsed_482 into %inserted_slice_481[10, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  %collapsed_484 = tensor.collapse_shape %inserted_slice_439 [[0], [1, 2], [3], [4]] : tensor<2x1x12x4x64xf32> into tensor<2x12x4x64xf32>
  %inserted_slice_485 = tensor.insert_slice %collapsed_484 into %inserted_slice_483[11, 0, 0, 0, 0, 0] [1, 2, 1, 12, 4, 64] [1, 1, 1, 1, 1, 1] : tensor<2x12x4x64xf32> into tensor<12x2x1x12x4x64xf32>
  // Export the two result tensors as HAL buffer views and return them in the order
  // (output0, output1), matching the function's declared result types:
  //   output0: tensor<1x4x50257xf32>
  //   output1: tensor<12x2x1x12x4x64xf32>
  %378 = hal.tensor.export %expanded_438 "output0" : tensor<1x4x50257xf32> -> !hal.buffer_view
  %379 = hal.tensor.export %inserted_slice_485 "output1" : tensor<12x2x1x12x4x64xf32> -> !hal.buffer_view
  util.return %378, %379 : !hal.buffer_view, !hal.buffer_view
}