vivekkhandelwal1/error_after_tensor_reshape_folding.mlir

## error_after_tensor_reshape_folding.mlir

bert_input_ir.mlir:1133:12: error: 'linalg.generic' op unexpected result less than 0 at expression #0 in (d0, d1, d2, d3) -> (d0, d3)
    %404 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%401, %96 : tensor<?x12x?x?xf32>, tensor<?x1x1x?xf32>) outs(%398 : tensor<?x12x?x?xf32>) {
           ^
bert_input_ir.mlir:20:3: note: called from
  func @forward(%arg0: tensor<1x512xi64>) -> tensor<?x2xf32> {
  ^
bert_input_ir.mlir:1133:12: note: see current operation: %333 = "linalg.generic"(%332, %263, %330) ({
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
  %600 = "arith.truncf"(%9) : (f64) -> f32
  %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
  %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
  "linalg.yield"(%602) : (f32) -> ()
}) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<1x?xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %404 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%401, %96 : tensor<?x12x?x?xf32>, tensor<?x1x1x?xf32>) outs(%398 : tensor<?x12x?x?xf32>) {
           ^
// -----// IR Dump After Canonicalizer Failed //----- //
"builtin.func"() ({
^bb0(%arg0: !hal.buffer_view):
  %0 = "arith.constant"() {value = 1 : index} : () -> index
  %1 = "arith.constant"() {value = false} : () -> i1
  %2 = "arith.constant"() {value = 0 : index} : () -> index
  %3 = "arith.constant"() {value = 512 : i64} : () -> i64
  %4 = "arith.constant"() {value = 0 : i64} : () -> i64
  %5 = "arith.constant"() {value = 512 : index} : () -> index
  %6 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x2xf32>} : () -> tensor<384x2xf32>
  %7 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %8 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
  %9 = "arith.constant"() {value = 5.6568542494923806 : f64} : () -> f64
  %10 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
  %11 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
  %12 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
  %13 = "arith.constant"() {value = -1.000000e+04 : f32} : () -> f32
  %14 = "arith.constant"() {value = 30522 : i64} : () -> i64
  %15 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
  %16 = "arith.constant"() {value = -3.40282347E+38 : f32} : () -> f32
  %17 = "arith.constant"() {value = 5.000000e-01 : f32} : () -> f32
  %18 = "arith.constant"() {value = 2.000000e+00 : f32} : () -> f32
  %19 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32
  %20 = "arith.constant"() {value = 9223372036854775807 : i64} : () -> i64
  %21 = "arith.constant"() {value = 9.9999999999999998E-13 : f64} : () -> f64
  %22 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x512xi64>} : () -> tensor<1x512xi64>
  %23 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<30522x384xf32>} : () -> tensor<30522x384xf32>
  %24 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<2x384xf32>} : () -> tensor<2x384xf32>
  %25 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<512x384xf32>} : () -> tensor<512x384xf32>
  %26 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %27 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %28 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %29 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %30 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %31 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %32 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %33 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %34 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
  %35 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
  %36 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %37 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
  %38 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %39 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %40 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %41 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %42 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %43 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %44 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %45 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %46 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %47 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %48 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %49 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %50 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
  %51 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
  %52 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %53 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
  %54 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %55 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %56 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %57 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %58 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %59 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %60 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %61 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %62 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %63 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %64 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %65 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %66 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
  %67 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
  %68 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %69 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
  %70 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %71 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %72 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %73 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %74 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %75 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %76 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %77 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %78 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %79 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %80 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %81 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %82 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
  %83 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
  %84 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %85 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
  %86 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %87 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %88 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %89 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %90 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %91 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %92 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %93 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %94 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %95 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %96 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %97 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %98 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
  %99 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
  %100 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %101 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
  %102 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %103 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %104 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %105 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %106 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %107 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %108 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %109 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %110 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %111 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
  %112 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %113 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %114 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
  %115 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
  %116 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %117 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
  %118 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %119 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %120 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
  %121 = "arith.constant"() {value = dense<[-0.00115577725, 0.00115577038]> : tensor<2xf32>} : () -> tensor<2xf32>
  %122 = "arith.constant"() {value = 3.840000e+02 : f32} : () -> f32
  %123 = "hal.tensor.import"(%arg0) {target_encoding = tensor<1x512xi64>} : (!hal.buffer_view) -> tensor<1x512xi64>
  %124 = "linalg.init_tensor"() {static_sizes = []} : () -> tensor<i64>
  %125 = "linalg.fill"(%3, %124) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<i64>) -> tensor<i64>
  %126 = "tensor.extract"(%125) : (tensor<i64>) -> i64
  %127 = "arith.index_cast"(%126) : (i64) -> index
  %128 = "linalg.init_tensor"(%127) {static_sizes = [1, -1]} : (index) -> tensor<1x?xf32>
  %129 = "linalg.fill"(%19, %128) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x?xf32>) -> tensor<1x?xf32>
  %130 = "tensor.extract_slice"(%129, %127) {operand_segment_sizes = dense<[1, 0, 1, 0]> : vector<4xi32>, static_offsets = [0, 0], static_sizes = [1, -1], static_strides = [1, 1]} : (tensor<1x?xf32>, index) -> tensor<?xf32>
  %131 = "tensor.dim"(%130, %2) : (tensor<?xf32>, index) -> index
  %132 = "tensor.dim"(%130, %2) : (tensor<?xf32>, index) -> index
  %133 = "flow.tensor.reshape"(%130, %132, %0, %131) {operand_segment_sizes = dense<[1, 1, 2]> : vector<3xi32>} : (tensor<?xf32>, index, index, index) -> tensor<?x1x1x?xf32>
  %134 = "arith.cmpi"(%4, %126) {predicate = 4 : i64} : (i64, i64) -> i1
  %135 = "std.select"(%134, %126, %4) : (i1, i64, i64) -> i64
  %136 = "arith.index_cast"(%135) : (i64) -> index
  %137 = "arith.cmpi"(%20, %126) {predicate = 4 : i64} : (i64, i64) -> i1
  %138 = "std.select"(%137, %126, %20) : (i1, i64, i64) -> i64
  %139 = "arith.index_cast"(%138) : (i64) -> index
  %140 = "arith.cmpi"(%139, %136) {predicate = 5 : i64} : (index, index) -> i1
  %141 = "std.select"(%140, %139, %136) : (i1, index, index) -> index
  %142 = "arith.subi"(%141, %136) : (index, index) -> index
  %143 = "tensor.extract_slice"(%133, %136, %142) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, 0, 0, -9223372036854775808], static_sizes = [1, 1, 1, -1], static_strides = [1, 1, 1, 1]} : (tensor<?x1x1x?xf32>, index, index) -> tensor<?xf32>
  %144 = "linalg.init_tensor"(%142) {static_sizes = [-1]} : (index) -> tensor<?xf32>
  %145 = "linalg.generic"(%143, %144) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.subf"(%19, %arg1) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %13) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
  %146 = "tensor.dim"(%145, %2) : (tensor<?xf32>, index) -> index
  %147 = "tensor.dim"(%145, %2) : (tensor<?xf32>, index) -> index
  %148 = "linalg.fill"(%3, %124) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<i64>) -> tensor<i64>
  %149 = "tensor.extract"(%148) : (tensor<i64>) -> i64
  %150 = "arith.addi"(%149, %3) : (i64, i64) -> i64
  %151 = "arith.cmpi"(%149, %4) {predicate = 5 : i64} : (i64, i64) -> i1
  %152 = "std.select"(%151, %149, %150) : (i1, i64, i64) -> i64
  %153 = "arith.cmpi"(%152, %4) {predicate = 2 : i64} : (i64, i64) -> i1
  %154 = "std.select"(%153, %4, %152) : (i1, i64, i64) -> i64
  %155 = "arith.cmpi"(%154, %3) {predicate = 4 : i64} : (i64, i64) -> i1
  %156 = "std.select"(%155, %3, %154) : (i1, i64, i64) -> i64
  %157 = "arith.index_cast"(%156) : (i64) -> index
  %158 = "arith.cmpi"(%157, %2) {predicate = 5 : i64} : (index, index) -> i1
  %159 = "std.select"(%158, %157, %2) : (i1, index, index) -> index
  %160 = "tensor.extract_slice"(%22, %159) {operand_segment_sizes = dense<[1, 0, 1, 0]> : vector<4xi32>, static_offsets = [0, 0], static_sizes = [1, -1], static_strides = [1, 1]} : (tensor<1x512xi64>, index) -> tensor<?xi64>
  %161 = "flow.tensor.reshape"(%123) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<1x512xi64>) -> tensor<512xi64>
  %162 = "arith.cmpi"(%5, %159) {predicate = 0 : i64} : (index, index) -> i1
  "std.assert"(%162) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %163 = "linalg.init_tensor"() {static_sizes = [512, 384]} : () -> tensor<512x384xf32>
  %164 = "linalg.generic"(%161, %160, %163) ({
  ^bb0(%arg1: i64, %arg2: i64, %arg3: f32):
    %600 = "linalg.index"() {dim = 1 : i64} : () -> index
    %601 = "tensor.extract"(%24, %2, %600) : (tensor<2x384xf32>, index, index) -> f32
    %602 = "arith.index_cast"(%arg1) : (i64) -> index
    %603 = "arith.cmpi"(%arg1, %14) {predicate = 2 : i64} : (i64, i64) -> i1
    "std.assert"(%603) {msg = "index must be smaller than dim size"} : (i1) -> ()
    %604 = "arith.cmpi"(%arg1, %4) {predicate = 5 : i64} : (i64, i64) -> i1
    "std.assert"(%604) {msg = "index must be larger or equal to 0"} : (i1) -> ()
    %605 = "tensor.extract"(%23, %602, %600) : (tensor<30522x384xf32>, index, index) -> f32
    %606 = "arith.index_cast"(%arg2) : (i64) -> index
    %607 = "arith.cmpi"(%arg2, %3) {predicate = 2 : i64} : (i64, i64) -> i1
    "std.assert"(%607) {msg = "index must be smaller than dim size"} : (i1) -> ()
    %608 = "arith.cmpi"(%arg2, %4) {predicate = 5 : i64} : (i64, i64) -> i1
    "std.assert"(%608) {msg = "index must be larger or equal to 0"} : (i1) -> ()
    %609 = "tensor.extract"(%25, %606, %600) : (tensor<512x384xf32>, index, index) -> f32
    %610 = "arith.addf"(%605, %601) : (f32, f32) -> f32
    %611 = "arith.addf"(%610, %609) : (f32, f32) -> f32
    "linalg.yield"(%611) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<512xi64>, tensor<?xi64>, tensor<512x384xf32>) -> tensor<512x384xf32>
  %165 = "linalg.init_tensor"() {static_sizes = [512]} : () -> tensor<512xf32>
  %166 = "linalg.fill"(%15, %165) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<512xf32>) -> tensor<512xf32>
  %167 = "linalg.generic"(%164, %166) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512xf32>) -> tensor<512xf32>
  %168 = "linalg.generic"(%167, %165) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32>
  %169 = "linalg.fill"(%15, %165) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<512xf32>) -> tensor<512xf32>
  %170 = "linalg.generic"(%164, %168, %169) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32>
  %171 = "linalg.generic"(%164, %168, %170, %27, %26, %163) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512xf32>, tensor<512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
  %172 = "flow.tensor.reshape"(%171) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
  %173 = "linalg.generic"(%28, %163) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
  %174 = "flow.tensor.reshape"(%173) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
  %175 = "linalg.batch_matmul"(%172, %12, %174) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %176 = "linalg.generic"(%29, %163) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
  %177 = "flow.tensor.reshape"(%176) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
  %178 = "linalg.batch_matmul"(%172, %11, %177) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %179 = "flow.tensor.reshape"(%178, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<1x512x384xf32>, index, index) -> tensor<?x?x12x32xf32>
  %180 = "linalg.generic"(%30, %163) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
  %181 = "flow.tensor.reshape"(%180) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
  %182 = "linalg.batch_matmul"(%172, %10, %181) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %183 = "flow.tensor.reshape"(%182, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<1x512x384xf32>, index, index) -> tensor<?x?x12x32xf32>
  %184 = "flow.tensor.reshape"(%175, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<1x512x384xf32>, index, index) -> tensor<?x?x12x32xf32>
  %185 = "linalg.init_tensor"() {static_sizes = [12, 32, 512]} : () -> tensor<12x32x512xf32>
  %186 = "linalg.generic"(%179, %185) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<12x32x512xf32>) -> tensor<12x32x512xf32>
  %187 = "linalg.init_tensor"() {static_sizes = [12, 512, 32]} : () -> tensor<12x512x32xf32>
  %188 = "linalg.generic"(%184, %187) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
  %189 = "linalg.init_tensor"() {static_sizes = [12, 512, 512]} : () -> tensor<12x512x512xf32>
  %190 = "linalg.fill"(%15, %189) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
  %191 = "linalg.generic"(%188, %186, %190) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x32xf32>, tensor<12x32x512xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
  %192 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
  "std.assert"(%192) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %193 = "linalg.generic"(%191, %145, %189) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.truncf"(%9) : (f64) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<?xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
  %194 = "linalg.init_tensor"() {static_sizes = [12, 512]} : () -> tensor<12x512xf32>
  %195 = "linalg.fill"(%16, %194) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<12x512xf32>) -> tensor<12x512xf32>
  %196 = "linalg.init_tensor"() {static_sizes = [12, 512]} : () -> tensor<12x512xi64>
  %197 = "linalg.fill"(%4, %196) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<12x512xi64>) -> tensor<12x512xi64>
  %198:2 = "linalg.generic"(%193, %195, %197) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
    %600 = "linalg.index"() {dim = 2 : i64} : () -> index
    %601 = "arith.index_cast"(%600) : (index) -> i64
    %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
    %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
    %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
    "linalg.yield"(%603, %604) : (f32, i64) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>, tensor<12x512xi64>) -> (tensor<12x512xf32>, tensor<12x512xi64>)
  %199 = "linalg.generic"(%193, %198#0, %189) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "math.exp"(%600) : (f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
  %200 = "linalg.fill"(%15, %194) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<12x512xf32>) -> tensor<12x512xf32>
  %201 = "linalg.generic"(%199, %200) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>) -> tensor<12x512xf32>
  %202 = "linalg.generic"(%199, %201, %189) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
  %203 = "linalg.generic"(%183, %187) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
  %204 = "linalg.fill"(%15, %187) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
  %205 = "linalg.generic"(%202, %203, %204) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512x32xf32>, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
  %206 = "linalg.init_tensor"() {static_sizes = [512, 12, 32]} : () -> tensor<512x12x32xf32>
  %207 = "linalg.generic"(%205, %206) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d0, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<12x512x32xf32>, tensor<512x12x32xf32>) -> tensor<512x12x32xf32>
  %208 = "flow.tensor.reshape"(%207, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<512x12x32xf32>, index, index) -> tensor<?x?x384xf32>
  %209 = "linalg.generic"(%31, %163) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
  %210 = "flow.tensor.reshape"(%209) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
  %211 = "linalg.batch_matmul"(%208, %8, %210) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %212 = "linalg.init_tensor"() {static_sizes = [1, 512, 384]} : () -> tensor<1x512x384xf32>
  %213 = "flow.tensor.reshape"(%211) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<1x512x384xf32>) -> tensor<512x384xf32>
  %214 = "linalg.generic"(%213, %171, %212) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (0, d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %215 = "linalg.init_tensor"() {static_sizes = [1, 512]} : () -> tensor<1x512xf32>
  %216 = "linalg.fill"(%15, %215) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %217 = "linalg.generic"(%214, %216) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %218 = "linalg.generic"(%217, %215) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %219 = "linalg.fill"(%15, %215) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %220 = "linalg.generic"(%214, %218, %219) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %221 = "linalg.generic"(%214, %218, %220, %33, %32, %212) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %222 = "linalg.init_tensor"() {static_sizes = [1, 512, 1536]} : () -> tensor<1x512x1536xf32>
  %223 = "linalg.init_tensor"() {static_sizes = [1, 384, 1536]} : () -> tensor<1x384x1536xf32>
  %224 = "linalg.generic"(%34, %222) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %225 = "linalg.generic"(%35, %223) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<1x384x1536xf32>) -> tensor<1x384x1536xf32>
  %226 = "linalg.batch_matmul"(%221, %225, %224) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %227 = "linalg.generic"(%226, %222) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "math.sqrt"(%18) : (f32) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "math.erf"(%601) : (f32) -> f32
    %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
    %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
    %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
    "linalg.yield"(%605) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %228 = "linalg.init_tensor"() {static_sizes = [1, 1536, 384]} : () -> tensor<1x1536x384xf32>
  %229 = "linalg.generic"(%36, %212) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %230 = "linalg.generic"(%37, %228) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<1x1536x384xf32>) -> tensor<1x1536x384xf32>
  %231 = "linalg.batch_matmul"(%227, %230, %229) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x1536x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %232 = "linalg.generic"(%231, %221, %212) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %233 = "linalg.fill"(%15, %215) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %234 = "linalg.generic"(%232, %233) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %235 = "linalg.generic"(%234, %215) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %236 = "linalg.fill"(%15, %215) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %237 = "linalg.generic"(%232, %235, %236) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %238 = "linalg.generic"(%232, %235, %237, %39, %38, %212) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %239 = "linalg.init_tensor"() {static_sizes = [1, 384, 384]} : () -> tensor<1x384x384xf32>
  %240 = "linalg.generic"(%40, %212) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %241 = "linalg.generic"(%41, %239) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %242 = "linalg.batch_matmul"(%238, %241, %240) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %243 = "tensor.cast"(%242) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %244 = "linalg.generic"(%42, %212) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %245 = "linalg.generic"(%43, %239) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %246 = "linalg.batch_matmul"(%238, %245, %244) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %247 = "tensor.cast"(%246) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %248 = "flow.tensor.reshape"(%247, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %249 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 32]} : () -> tensor<1x12x512x32xf32>
  %250 = "linalg.generic"(%44, %212) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %251 = "linalg.generic"(%45, %239) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %252 = "linalg.batch_matmul"(%238, %251, %250) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %253 = "tensor.cast"(%252) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %254 = "flow.tensor.reshape"(%253, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %255 = "linalg.generic"(%254, %249) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %256 = "flow.tensor.reshape"(%243, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %257 = "linalg.generic"(%256, %249) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %258 = "linalg.init_tensor"() {static_sizes = [1, 12, 32, 512]} : () -> tensor<1x12x32x512xf32>
  %259 = "linalg.generic"(%248, %258) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x32x512xf32>) -> tensor<1x12x32x512xf32>
  %260 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 512]} : () -> tensor<1x12x512x512xf32>
  %261 = "linalg.fill"(%15, %260) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %262 = "linalg.generic"(%257, %259, %261) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x12x32x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  "std.assert"(%192) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %263 = "flow.tensor.reshape"(%145, %147, %146) {operand_segment_sizes = dense<1> : vector<3xi32>} : (tensor<?xf32>, index, index) -> tensor<1x?xf32>
  %264 = "linalg.generic"(%262, %263, %260) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.truncf"(%9) : (f64) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x?xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %265 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xf32>
  %266 = "linalg.fill"(%16, %265) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %267 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xi64>
  %268 = "linalg.fill"(%4, %267) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<1x12x512xi64>) -> tensor<1x12x512xi64>
  %269:2 = "linalg.generic"(%264, %266, %268) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
    %600 = "linalg.index"() {dim = 3 : i64} : () -> index
    %601 = "arith.index_cast"(%600) : (index) -> i64
    %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
    %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
    %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
    "linalg.yield"(%603, %604) : (f32, i64) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512xi64>) -> (tensor<1x12x512xf32>, tensor<1x12x512xi64>)
  %270 = "linalg.generic"(%264, %269#0, %260) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "math.exp"(%600) : (f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %271 = "linalg.fill"(%15, %265) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %272 = "linalg.generic"(%270, %271) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %273 = "linalg.generic"(%270, %272, %260) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %274 = "linalg.fill"(%15, %249) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %275 = "linalg.generic"(%273, %255, %274) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %276 = "linalg.init_tensor"() {static_sizes = [1, 512, 12, 32]} : () -> tensor<1x512x12x32xf32>
  %277 = "linalg.generic"(%275, %276) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x512x12x32xf32>) -> tensor<1x512x12x32xf32>
  %278 = "tensor.cast"(%277) : (tensor<1x512x12x32xf32>) -> tensor<?x?x12x32xf32>
  %279 = "flow.tensor.reshape"(%278, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
  %280 = "linalg.generic"(%46, %212) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %281 = "linalg.generic"(%47, %239) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %282 = "linalg.batch_matmul"(%279, %281, %280) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %283 = "linalg.init_tensor"() {static_sizes = [0, 512, 384]} : () -> tensor<0x512x384xf32>
  %284 = "linalg.generic"(%282, %238, %283) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %285 = "linalg.init_tensor"() {static_sizes = [0, 512]} : () -> tensor<0x512xf32>
  %286 = "linalg.fill"(%15, %285) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %287 = "linalg.generic"(%284, %286) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %288 = "linalg.generic"(%287, %285) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %289 = "linalg.fill"(%15, %285) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %290 = "linalg.generic"(%284, %288, %289) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %291 = "linalg.generic"(%284, %288, %290, %49, %48, %283) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %292 = "linalg.init_tensor"() {static_sizes = [0, 512, 1536]} : () -> tensor<0x512x1536xf32>
  %293 = "linalg.init_tensor"() {static_sizes = [0, 384, 1536]} : () -> tensor<0x384x1536xf32>
  %294 = "linalg.generic"(%50, %292) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %295 = "linalg.generic"(%51, %293) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<0x384x1536xf32>) -> tensor<0x384x1536xf32>
  %296 = "linalg.batch_matmul"(%291, %295, %294) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %297 = "linalg.generic"(%296, %292) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "math.sqrt"(%18) : (f32) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "math.erf"(%601) : (f32) -> f32
    %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
    %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
    %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
    "linalg.yield"(%605) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %298 = "linalg.init_tensor"() {static_sizes = [0, 1536, 384]} : () -> tensor<0x1536x384xf32>
  %299 = "linalg.generic"(%52, %283) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %300 = "linalg.generic"(%53, %298) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<0x1536x384xf32>) -> tensor<0x1536x384xf32>
  %301 = "linalg.batch_matmul"(%297, %300, %299) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x1536x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %302 = "linalg.generic"(%301, %291, %283) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %303 = "linalg.fill"(%15, %285) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %304 = "linalg.generic"(%302, %303) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %305 = "linalg.generic"(%304, %285) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %306 = "linalg.fill"(%15, %285) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %307 = "linalg.generic"(%302, %305, %306) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %308 = "linalg.generic"(%302, %305, %307, %55, %54, %283) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %309 = "linalg.init_tensor"() {static_sizes = [0, 384, 384]} : () -> tensor<0x384x384xf32>
  %310 = "linalg.generic"(%56, %283) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %311 = "linalg.generic"(%57, %309) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %312 = "linalg.batch_matmul"(%308, %311, %310) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %313 = "tensor.cast"(%312) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
  %314 = "linalg.generic"(%58, %283) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %315 = "linalg.generic"(%59, %309) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %316 = "linalg.batch_matmul"(%308, %315, %314) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %317 = "tensor.cast"(%316) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
  %318 = "flow.tensor.reshape"(%317, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %319 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 32]} : () -> tensor<0x12x512x32xf32>
  %320 = "linalg.generic"(%60, %283) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %321 = "linalg.generic"(%61, %309) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %322 = "linalg.batch_matmul"(%308, %321, %320) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %323 = "tensor.cast"(%322) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
  %324 = "flow.tensor.reshape"(%323, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %325 = "linalg.generic"(%324, %319) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %326 = "flow.tensor.reshape"(%313, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %327 = "linalg.generic"(%326, %319) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %328 = "linalg.init_tensor"() {static_sizes = [0, 12, 32, 512]} : () -> tensor<0x12x32x512xf32>
  %329 = "linalg.generic"(%318, %328) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x32x512xf32>) -> tensor<0x12x32x512xf32>
  %330 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 512]} : () -> tensor<0x12x512x512xf32>
  %331 = "linalg.fill"(%15, %330) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %332 = "linalg.generic"(%327, %329, %331) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x12x32x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
  "std.assert"(%192) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %333 = "linalg.generic"(%332, %263, %330) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.truncf"(%9) : (f64) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<1x?xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %334 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xf32>
  %335 = "linalg.fill"(%16, %334) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
  %336 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xi64>
  %337 = "linalg.fill"(%4, %336) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<0x12x512xi64>) -> tensor<0x12x512xi64>
  %338:2 = "linalg.generic"(%333, %335, %337) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
    %600 = "linalg.index"() {dim = 3 : i64} : () -> index
    %601 = "arith.index_cast"(%600) : (index) -> i64
    %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
    %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
    %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
    "linalg.yield"(%603, %604) : (f32, i64) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512xi64>) -> (tensor<0x12x512xf32>, tensor<0x12x512xi64>)
  %339 = "linalg.generic"(%333, %338#0, %330) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "math.exp"(%600) : (f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %340 = "linalg.fill"(%15, %334) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
  %341 = "linalg.generic"(%339, %340) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
  %342 = "linalg.generic"(%339, %341, %330) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %343 = "linalg.fill"(%15, %319) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %344 = "linalg.generic"(%342, %325, %343) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %345 = "linalg.init_tensor"() {static_sizes = [0, 512, 12, 32]} : () -> tensor<0x512x12x32xf32>
  %346 = "linalg.generic"(%344, %345) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x512x12x32xf32>) -> tensor<0x512x12x32xf32>
  %347 = "tensor.cast"(%346) : (tensor<0x512x12x32xf32>) -> tensor<?x?x12x32xf32>
  %348 = "flow.tensor.reshape"(%347, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
  %349 = "linalg.generic"(%62, %283) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %350 = "linalg.generic"(%63, %309) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %351 = "linalg.batch_matmul"(%348, %350, %349) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %352 = "linalg.init_tensor"() {static_sizes = [1, 512, 384]} : () -> tensor<1x512x384xf32>
  %353 = "linalg.generic"(%351, %308, %352) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %354 = "linalg.init_tensor"() {static_sizes = [1, 512]} : () -> tensor<1x512xf32>
  %355 = "linalg.fill"(%15, %354) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %356 = "linalg.generic"(%353, %355) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %357 = "linalg.generic"(%356, %354) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %358 = "linalg.fill"(%15, %354) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %359 = "linalg.generic"(%353, %357, %358) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %360 = "linalg.generic"(%353, %357, %359, %65, %64, %352) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %361 = "linalg.init_tensor"() {static_sizes = [1, 512, 1536]} : () -> tensor<1x512x1536xf32>
  %362 = "linalg.init_tensor"() {static_sizes = [1, 384, 1536]} : () -> tensor<1x384x1536xf32>
  %363 = "linalg.generic"(%66, %361) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %364 = "linalg.generic"(%67, %362) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<1x384x1536xf32>) -> tensor<1x384x1536xf32>
  %365 = "linalg.batch_matmul"(%360, %364, %363) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %366 = "linalg.generic"(%365, %361) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "math.sqrt"(%18) : (f32) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "math.erf"(%601) : (f32) -> f32
    %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
    %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
    %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
    "linalg.yield"(%605) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %367 = "linalg.init_tensor"() {static_sizes = [1, 1536, 384]} : () -> tensor<1x1536x384xf32>
  %368 = "linalg.generic"(%68, %352) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %369 = "linalg.generic"(%69, %367) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<1x1536x384xf32>) -> tensor<1x1536x384xf32>
  %370 = "linalg.batch_matmul"(%366, %369, %368) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x1536x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %371 = "linalg.generic"(%370, %360, %352) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %372 = "linalg.fill"(%15, %354) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %373 = "linalg.generic"(%371, %372) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %374 = "linalg.generic"(%373, %354) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %375 = "linalg.fill"(%15, %354) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %376 = "linalg.generic"(%371, %374, %375) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %377 = "linalg.generic"(%371, %374, %376, %71, %70, %352) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %378 = "linalg.init_tensor"() {static_sizes = [1, 384, 384]} : () -> tensor<1x384x384xf32>
  %379 = "linalg.generic"(%72, %352) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %380 = "linalg.generic"(%73, %378) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %381 = "linalg.batch_matmul"(%377, %380, %379) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %382 = "tensor.cast"(%381) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %383 = "linalg.generic"(%74, %352) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %384 = "linalg.generic"(%75, %378) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %385 = "linalg.batch_matmul"(%377, %384, %383) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %386 = "tensor.cast"(%385) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %387 = "flow.tensor.reshape"(%386, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %388 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 32]} : () -> tensor<1x12x512x32xf32>
  %389 = "linalg.generic"(%76, %352) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %390 = "linalg.generic"(%77, %378) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %391 = "linalg.batch_matmul"(%377, %390, %389) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %392 = "tensor.cast"(%391) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %393 = "flow.tensor.reshape"(%392, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %394 = "linalg.generic"(%393, %388) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %395 = "flow.tensor.reshape"(%382, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %396 = "linalg.generic"(%395, %388) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %397 = "linalg.init_tensor"() {static_sizes = [1, 12, 32, 512]} : () -> tensor<1x12x32x512xf32>
  %398 = "linalg.generic"(%387, %397) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x32x512xf32>) -> tensor<1x12x32x512xf32>
  %399 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 512]} : () -> tensor<1x12x512x512xf32>
  %400 = "linalg.fill"(%15, %399) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %401 = "linalg.generic"(%396, %398, %400) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x12x32x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %402 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
  "std.assert"(%402) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %403 = "linalg.generic"(%401, %263, %399) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.truncf"(%9) : (f64) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x?xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %404 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xf32>
  %405 = "linalg.fill"(%16, %404) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %406 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xi64>
  %407 = "linalg.fill"(%4, %406) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<1x12x512xi64>) -> tensor<1x12x512xi64>
  %408:2 = "linalg.generic"(%403, %405, %407) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
    %600 = "linalg.index"() {dim = 3 : i64} : () -> index
    %601 = "arith.index_cast"(%600) : (index) -> i64
    %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
    %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
    %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
    "linalg.yield"(%603, %604) : (f32, i64) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512xi64>) -> (tensor<1x12x512xf32>, tensor<1x12x512xi64>)
  %409 = "linalg.generic"(%403, %408#0, %399) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "math.exp"(%600) : (f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %410 = "linalg.fill"(%15, %404) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %411 = "linalg.generic"(%409, %410) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %412 = "linalg.generic"(%409, %411, %399) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %413 = "linalg.fill"(%15, %388) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %414 = "linalg.generic"(%412, %394, %413) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %415 = "linalg.init_tensor"() {static_sizes = [1, 512, 12, 32]} : () -> tensor<1x512x12x32xf32>
  %416 = "linalg.generic"(%414, %415) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x512x12x32xf32>) -> tensor<1x512x12x32xf32>
  %417 = "tensor.cast"(%416) : (tensor<1x512x12x32xf32>) -> tensor<?x?x12x32xf32>
  %418 = "flow.tensor.reshape"(%417, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
  %419 = "linalg.generic"(%78, %352) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %420 = "linalg.generic"(%79, %378) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %421 = "linalg.batch_matmul"(%418, %420, %419) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %422 = "linalg.init_tensor"() {static_sizes = [0, 512, 384]} : () -> tensor<0x512x384xf32>
  %423 = "linalg.generic"(%421, %377, %422) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %424 = "linalg.init_tensor"() {static_sizes = [0, 512]} : () -> tensor<0x512xf32>
  %425 = "linalg.fill"(%15, %424) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %426 = "linalg.generic"(%423, %425) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %427 = "linalg.generic"(%426, %424) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %428 = "linalg.fill"(%15, %424) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %429 = "linalg.generic"(%423, %427, %428) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %430 = "linalg.generic"(%423, %427, %429, %81, %80, %422) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %431 = "linalg.init_tensor"() {static_sizes = [0, 512, 1536]} : () -> tensor<0x512x1536xf32>
  %432 = "linalg.init_tensor"() {static_sizes = [0, 384, 1536]} : () -> tensor<0x384x1536xf32>
  %433 = "linalg.generic"(%82, %431) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %434 = "linalg.generic"(%83, %432) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<0x384x1536xf32>) -> tensor<0x384x1536xf32>
  %435 = "linalg.batch_matmul"(%430, %434, %433) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %436 = "linalg.generic"(%435, %431) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "math.sqrt"(%18) : (f32) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "math.erf"(%601) : (f32) -> f32
    %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
    %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
    %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
    "linalg.yield"(%605) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %437 = "linalg.init_tensor"() {static_sizes = [0, 1536, 384]} : () -> tensor<0x1536x384xf32>
  %438 = "linalg.generic"(%84, %422) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %439 = "linalg.generic"(%85, %437) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<0x1536x384xf32>) -> tensor<0x1536x384xf32>
  %440 = "linalg.batch_matmul"(%436, %439, %438) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x1536x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %441 = "linalg.generic"(%440, %430, %422) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %442 = "linalg.fill"(%15, %424) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %443 = "linalg.generic"(%441, %442) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %444 = "linalg.generic"(%443, %424) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %445 = "linalg.fill"(%15, %424) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %446 = "linalg.generic"(%441, %444, %445) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %447 = "linalg.generic"(%441, %444, %446, %87, %86, %422) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %448 = "linalg.init_tensor"() {static_sizes = [0, 384, 384]} : () -> tensor<0x384x384xf32>
  %449 = "linalg.generic"(%88, %422) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %450 = "linalg.generic"(%89, %448) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %451 = "linalg.batch_matmul"(%447, %450, %449) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %452 = "tensor.cast"(%451) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
  %453 = "linalg.generic"(%90, %422) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %454 = "linalg.generic"(%91, %448) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %455 = "linalg.batch_matmul"(%447, %454, %453) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %456 = "tensor.cast"(%455) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
  %457 = "flow.tensor.reshape"(%456, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %458 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 32]} : () -> tensor<0x12x512x32xf32>
  %459 = "linalg.generic"(%92, %422) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %460 = "linalg.generic"(%93, %448) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %461 = "linalg.batch_matmul"(%447, %460, %459) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %462 = "tensor.cast"(%461) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
  %463 = "flow.tensor.reshape"(%462, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %464 = "linalg.generic"(%463, %458) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %465 = "flow.tensor.reshape"(%452, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %466 = "linalg.generic"(%465, %458) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %467 = "linalg.init_tensor"() {static_sizes = [0, 12, 32, 512]} : () -> tensor<0x12x32x512xf32>
  %468 = "linalg.generic"(%457, %467) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x32x512xf32>) -> tensor<0x12x32x512xf32>
  %469 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 512]} : () -> tensor<0x12x512x512xf32>
  %470 = "linalg.fill"(%15, %469) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %471 = "linalg.generic"(%466, %468, %470) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x12x32x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %472 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
  "std.assert"(%472) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %473 = "linalg.generic"(%471, %263, %469) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.truncf"(%9) : (f64) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<1x?xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %474 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xf32>
  %475 = "linalg.fill"(%16, %474) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
  %476 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xi64>
  %477 = "linalg.fill"(%4, %476) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<0x12x512xi64>) -> tensor<0x12x512xi64>
  %478:2 = "linalg.generic"(%473, %475, %477) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
    %600 = "linalg.index"() {dim = 3 : i64} : () -> index
    %601 = "arith.index_cast"(%600) : (index) -> i64
    %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
    %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
    %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
    "linalg.yield"(%603, %604) : (f32, i64) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512xi64>) -> (tensor<0x12x512xf32>, tensor<0x12x512xi64>)
  %479 = "linalg.generic"(%473, %478#0, %469) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "math.exp"(%600) : (f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %480 = "linalg.fill"(%15, %474) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
  %481 = "linalg.generic"(%479, %480) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
  %482 = "linalg.generic"(%479, %481, %469) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
  %483 = "linalg.fill"(%15, %458) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %484 = "linalg.generic"(%482, %464, %483) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
  %485 = "linalg.init_tensor"() {static_sizes = [0, 512, 12, 32]} : () -> tensor<0x512x12x32xf32>
  %486 = "linalg.generic"(%484, %485) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x512x12x32xf32>) -> tensor<0x512x12x32xf32>
  %487 = "tensor.cast"(%486) : (tensor<0x512x12x32xf32>) -> tensor<?x?x12x32xf32>
  %488 = "flow.tensor.reshape"(%487, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
  %489 = "linalg.generic"(%94, %422) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %490 = "linalg.generic"(%95, %448) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
  %491 = "linalg.batch_matmul"(%488, %490, %489) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %492 = "linalg.init_tensor"() {static_sizes = [1, 512, 384]} : () -> tensor<1x512x384xf32>
  %493 = "linalg.generic"(%491, %447, %492) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %494 = "linalg.init_tensor"() {static_sizes = [1, 512]} : () -> tensor<1x512xf32>
  %495 = "linalg.fill"(%15, %494) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %496 = "linalg.generic"(%493, %495) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %497 = "linalg.generic"(%496, %494) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %498 = "linalg.fill"(%15, %494) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %499 = "linalg.generic"(%493, %497, %498) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %500 = "linalg.generic"(%493, %497, %499, %97, %96, %492) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %501 = "linalg.init_tensor"() {static_sizes = [1, 512, 1536]} : () -> tensor<1x512x1536xf32>
  %502 = "linalg.init_tensor"() {static_sizes = [1, 384, 1536]} : () -> tensor<1x384x1536xf32>
  %503 = "linalg.generic"(%98, %501) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %504 = "linalg.generic"(%99, %502) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<1x384x1536xf32>) -> tensor<1x384x1536xf32>
  %505 = "linalg.batch_matmul"(%500, %504, %503) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %506 = "linalg.generic"(%505, %501) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "math.sqrt"(%18) : (f32) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "math.erf"(%601) : (f32) -> f32
    %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
    %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
    %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
    "linalg.yield"(%605) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
  %507 = "linalg.init_tensor"() {static_sizes = [1, 1536, 384]} : () -> tensor<1x1536x384xf32>
  %508 = "linalg.generic"(%100, %492) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %509 = "linalg.generic"(%101, %507) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<1x1536x384xf32>) -> tensor<1x1536x384xf32>
  %510 = "linalg.batch_matmul"(%506, %509, %508) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x1536x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %511 = "linalg.generic"(%510, %500, %492) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %512 = "linalg.fill"(%15, %494) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %513 = "linalg.generic"(%511, %512) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %514 = "linalg.generic"(%513, %494) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %515 = "linalg.fill"(%15, %494) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
  %516 = "linalg.generic"(%511, %514, %515) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
  %517 = "linalg.generic"(%511, %514, %516, %103, %102, %492) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %518 = "linalg.init_tensor"() {static_sizes = [1, 384, 384]} : () -> tensor<1x384x384xf32>
  %519 = "linalg.generic"(%104, %492) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %520 = "linalg.generic"(%105, %518) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %521 = "linalg.batch_matmul"(%517, %520, %519) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %522 = "tensor.cast"(%521) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %523 = "linalg.generic"(%106, %492) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %524 = "linalg.generic"(%107, %518) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %525 = "linalg.batch_matmul"(%517, %524, %523) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %526 = "tensor.cast"(%525) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %527 = "flow.tensor.reshape"(%526, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %528 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 32]} : () -> tensor<1x12x512x32xf32>
  %529 = "linalg.generic"(%108, %492) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %530 = "linalg.generic"(%109, %518) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %531 = "linalg.batch_matmul"(%517, %530, %529) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %532 = "tensor.cast"(%531) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
  %533 = "flow.tensor.reshape"(%532, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %534 = "linalg.generic"(%533, %528) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %535 = "flow.tensor.reshape"(%522, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
  %536 = "linalg.generic"(%535, %528) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %537 = "linalg.init_tensor"() {static_sizes = [1, 12, 32, 512]} : () -> tensor<1x12x32x512xf32>
  %538 = "linalg.generic"(%527, %537) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x32x512xf32>) -> tensor<1x12x32x512xf32>
  %539 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 512]} : () -> tensor<1x12x512x512xf32>
  %540 = "linalg.fill"(%15, %539) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %541 = "linalg.generic"(%536, %538, %540) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x12x32x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %542 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
  "std.assert"(%542) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %543 = "linalg.generic"(%541, %263, %539) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.truncf"(%9) : (f64) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x?xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %544 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xf32>
  %545 = "linalg.fill"(%16, %544) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %546 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xi64>
  %547 = "linalg.fill"(%4, %546) ({
  ^bb0(%arg1: i64, %arg2: i64):
    "linalg.yield"(%arg1) : (i64) -> ()
  }) : (i64, tensor<1x12x512xi64>) -> tensor<1x12x512xi64>
  %548:2 = "linalg.generic"(%543, %545, %547) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
    %600 = "linalg.index"() {dim = 3 : i64} : () -> index
    %601 = "arith.index_cast"(%600) : (index) -> i64
    %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
    %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
    %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
    "linalg.yield"(%603, %604) : (f32, i64) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512xi64>) -> (tensor<1x12x512xf32>, tensor<1x12x512xi64>)
  %549 = "linalg.generic"(%543, %548#0, %539) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "math.exp"(%600) : (f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %550 = "linalg.fill"(%15, %544) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %551 = "linalg.generic"(%549, %550) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
  %552 = "linalg.generic"(%549, %551, %539) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
  %553 = "linalg.fill"(%15, %528) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %554 = "linalg.generic"(%552, %534, %553) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
  %555 = "linalg.init_tensor"() {static_sizes = [1, 512, 12, 32]} : () -> tensor<1x512x12x32xf32>
  %556 = "linalg.generic"(%554, %555) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x512x12x32xf32>) -> tensor<1x512x12x32xf32>
  %557 = "tensor.cast"(%556) : (tensor<1x512x12x32xf32>) -> tensor<?x?x12x32xf32>
  %558 = "flow.tensor.reshape"(%557, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
  %559 = "linalg.generic"(%110, %492) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  %560 = "linalg.generic"(%111, %518) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
  %561 = "linalg.batch_matmul"(%558, %560, %559) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
  "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
  %562 = "linalg.init_tensor"() {static_sizes = [0, 512, 384]} : () -> tensor<0x512x384xf32>
  %563 = "linalg.generic"(%561, %517, %562) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %564 = "linalg.init_tensor"() {static_sizes = [0, 512]} : () -> tensor<0x512xf32>
  %565 = "linalg.fill"(%15, %564) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %566 = "linalg.generic"(%563, %565) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %567 = "linalg.generic"(%566, %564) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %568 = "linalg.fill"(%15, %564) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %569 = "linalg.generic"(%563, %567, %568) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %570 = "linalg.generic"(%563, %567, %569, %113, %112, %562) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %571 = "linalg.init_tensor"() {static_sizes = [0, 512, 1536]} : () -> tensor<0x512x1536xf32>
  %572 = "linalg.init_tensor"() {static_sizes = [0, 384, 1536]} : () -> tensor<0x384x1536xf32>
  %573 = "linalg.generic"(%114, %571) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %574 = "linalg.generic"(%115, %572) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<0x384x1536xf32>) -> tensor<0x384x1536xf32>
  %575 = "linalg.batch_matmul"(%570, %574, %573) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %576 = "linalg.generic"(%575, %571) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "math.sqrt"(%18) : (f32) -> f32
    %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
    %602 = "math.erf"(%601) : (f32) -> f32
    %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
    %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
    %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
    "linalg.yield"(%605) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
  %577 = "linalg.init_tensor"() {static_sizes = [0, 1536, 384]} : () -> tensor<0x1536x384xf32>
  %578 = "linalg.generic"(%116, %562) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %579 = "linalg.generic"(%117, %577) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<0x1536x384xf32>) -> tensor<0x1536x384xf32>
  %580 = "linalg.batch_matmul"(%576, %579, %578) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x1536x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %581 = "linalg.generic"(%580, %570, %562) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %582 = "linalg.fill"(%15, %564) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %583 = "linalg.generic"(%581, %582) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %584 = "linalg.generic"(%583, %564) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %585 = "linalg.fill"(%15, %564) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
  %586 = "linalg.generic"(%581, %584, %585) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
    %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
    "linalg.yield"(%602) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
  %587 = "linalg.generic"(%581, %584, %586, %119, %118, %562) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
    %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
    %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
    %602 = "arith.truncf"(%21) : (f64) -> f32
    %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
    %604 = "math.rsqrt"(%603) : (f32) -> f32
    %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
    %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
    %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
    "linalg.yield"(%607) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
  %588 = "tensor.extract_slice"(%587) {operand_segment_sizes = dense<[1, 0, 0, 0]> : vector<4xi32>, static_offsets = [0, 0, 0], static_sizes = [0, 1, 384], static_strides = [1, 1, 1]} : (tensor<0x512x384xf32>) -> tensor<0x1x384xf32>
  %589 = "tensor.cast"(%588) : (tensor<0x1x384xf32>) -> tensor<?x?x384xf32>
  %590 = "flow.tensor.reshape"(%589, %2, %0, %2) {operand_segment_sizes = dense<[1, 2, 1]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index) -> tensor<?x384xf32>
  %591 = "linalg.init_tensor"() {static_sizes = [0, 384]} : () -> tensor<0x384xf32>
  %592 = "linalg.generic"(%120, %591) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x384xf32>) -> tensor<0x384xf32>
  %593 = "linalg.matmul"(%590, %7, %592) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x384xf32>, tensor<384x384xf32>, tensor<0x384xf32>) -> tensor<0x384xf32>
  %594 = "linalg.generic"(%593, %591) ({
  ^bb0(%arg1: f32, %arg2: f32):
    %600 = "math.tanh"(%arg1) : (f32) -> f32
    "linalg.yield"(%600) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x384xf32>, tensor<0x384xf32>) -> tensor<0x384xf32>
  %595 = "linalg.init_tensor"() {static_sizes = [0, 2]} : () -> tensor<0x2xf32>
  %596 = "linalg.generic"(%121, %595) ({
  ^bb0(%arg1: f32, %arg2: f32):
    "linalg.yield"(%arg1) : (f32) -> ()
  }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<2xf32>, tensor<0x2xf32>) -> tensor<0x2xf32>
  %597 = "linalg.matmul"(%594, %6, %596) ({
  ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
    %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
    %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
    "linalg.yield"(%601) : (f32) -> ()
  }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x384xf32>, tensor<384x2xf32>, tensor<0x2xf32>) -> tensor<0x2xf32>
  %598 = "tensor.cast"(%597) : (tensor<0x2xf32>) -> tensor<?x2xf32>
  %599 = "hal.tensor.export"(%598, %2) {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>, source_encoding = tensor<?x2xf32>} : (tensor<?x2xf32>, index) -> !hal.buffer_view
  "std.return"(%599) : (!hal.buffer_view) -> ()
}) {iree.abi.stub, sym_name = "forward", type = (!hal.buffer_view) -> !hal.buffer_view} : () -> ()

bert_input_ir.mlir:19:1: error: conversion from source -> vm failed
module attributes {torch.debug_module_name = "MiniLMSequenceClassification"} {
^
bert_input_ir.mlir:19:1: note: see current operation: "builtin.module"() ({
  "builtin.func"() ({
  ^bb0(%arg0: !hal.buffer_view):
    %0 = "arith.constant"() {value = 1 : index} : () -> index
    %1 = "arith.constant"() {value = false} : () -> i1
    %2 = "arith.constant"() {value = 0 : index} : () -> index
    %3 = "arith.constant"() {value = 512 : i64} : () -> i64
    %4 = "arith.constant"() {value = 0 : i64} : () -> i64
    %5 = "arith.constant"() {value = 512 : index} : () -> index
    %6 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x2xf32>} : () -> tensor<384x2xf32>
    %7 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %8 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
    %9 = "arith.constant"() {value = 5.6568542494923806 : f64} : () -> f64
    %10 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
    %11 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
    %12 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x384x384xf32>} : () -> tensor<1x384x384xf32>
    %13 = "arith.constant"() {value = -1.000000e+04 : f32} : () -> f32
    %14 = "arith.constant"() {value = 30522 : i64} : () -> i64
    %15 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
    %16 = "arith.constant"() {value = -3.40282347E+38 : f32} : () -> f32
    %17 = "arith.constant"() {value = 5.000000e-01 : f32} : () -> f32
    %18 = "arith.constant"() {value = 2.000000e+00 : f32} : () -> f32
    %19 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32
    %20 = "arith.constant"() {value = 9223372036854775807 : i64} : () -> i64
    %21 = "arith.constant"() {value = 9.9999999999999998E-13 : f64} : () -> f64
    %22 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1x512xi64>} : () -> tensor<1x512xi64>
    %23 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<30522x384xf32>} : () -> tensor<30522x384xf32>
    %24 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<2x384xf32>} : () -> tensor<2x384xf32>
    %25 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<512x384xf32>} : () -> tensor<512x384xf32>
    %26 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %27 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %28 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %29 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %30 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %31 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %32 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %33 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %34 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
    %35 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
    %36 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %37 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
    %38 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %39 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %40 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %41 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %42 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %43 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %44 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %45 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %46 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %47 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %48 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %49 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %50 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
    %51 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
    %52 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %53 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
    %54 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %55 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %56 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %57 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %58 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %59 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %60 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %61 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %62 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %63 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %64 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %65 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %66 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
    %67 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
    %68 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %69 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
    %70 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %71 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %72 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %73 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %74 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %75 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %76 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %77 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %78 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %79 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %80 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %81 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %82 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
    %83 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
    %84 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %85 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
    %86 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %87 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %88 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %89 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %90 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %91 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %92 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %93 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %94 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %95 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %96 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %97 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %98 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
    %99 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
    %100 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %101 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
    %102 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %103 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %104 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %105 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %106 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %107 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %108 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %109 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %110 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %111 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x384xf32>} : () -> tensor<384x384xf32>
    %112 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %113 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %114 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536xf32>} : () -> tensor<1536xf32>
    %115 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<1536x384xf32>} : () -> tensor<1536x384xf32>
    %116 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %117 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384x1536xf32>} : () -> tensor<384x1536xf32>
    %118 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %119 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %120 = "arith.constant"() {value = opaque<"elided_large_const", "0xDEADBEEF"> : tensor<384xf32>} : () -> tensor<384xf32>
    %121 = "arith.constant"() {value = dense<[-0.00115577725, 0.00115577038]> : tensor<2xf32>} : () -> tensor<2xf32>
    %122 = "arith.constant"() {value = 3.840000e+02 : f32} : () -> f32
    %123 = "hal.tensor.import"(%arg0) {target_encoding = tensor<1x512xi64>} : (!hal.buffer_view) -> tensor<1x512xi64>
    %124 = "linalg.init_tensor"() {static_sizes = []} : () -> tensor<i64>
    %125 = "linalg.fill"(%3, %124) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<i64>) -> tensor<i64>
    %126 = "tensor.extract"(%125) : (tensor<i64>) -> i64
    %127 = "arith.index_cast"(%126) : (i64) -> index
    %128 = "linalg.init_tensor"(%127) {static_sizes = [1, -1]} : (index) -> tensor<1x?xf32>
    %129 = "linalg.fill"(%19, %128) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x?xf32>) -> tensor<1x?xf32>
    %130 = "tensor.extract_slice"(%129, %127) {operand_segment_sizes = dense<[1, 0, 1, 0]> : vector<4xi32>, static_offsets = [0, 0], static_sizes = [1, -1], static_strides = [1, 1]} : (tensor<1x?xf32>, index) -> tensor<?xf32>
    %131 = "tensor.dim"(%130, %2) : (tensor<?xf32>, index) -> index
    %132 = "tensor.dim"(%130, %2) : (tensor<?xf32>, index) -> index
    %133 = "flow.tensor.reshape"(%130, %132, %0, %131) {operand_segment_sizes = dense<[1, 1, 2]> : vector<3xi32>} : (tensor<?xf32>, index, index, index) -> tensor<?x1x1x?xf32>
    %134 = "arith.cmpi"(%4, %126) {predicate = 4 : i64} : (i64, i64) -> i1
    %135 = "std.select"(%134, %126, %4) : (i1, i64, i64) -> i64
    %136 = "arith.index_cast"(%135) : (i64) -> index
    %137 = "arith.cmpi"(%20, %126) {predicate = 4 : i64} : (i64, i64) -> i1
    %138 = "std.select"(%137, %126, %20) : (i1, i64, i64) -> i64
    %139 = "arith.index_cast"(%138) : (i64) -> index
    %140 = "arith.cmpi"(%139, %136) {predicate = 5 : i64} : (index, index) -> i1
    %141 = "std.select"(%140, %139, %136) : (i1, index, index) -> index
    %142 = "arith.subi"(%141, %136) : (index, index) -> index
    %143 = "tensor.extract_slice"(%133, %136, %142) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, 0, 0, -9223372036854775808], static_sizes = [1, 1, 1, -1], static_strides = [1, 1, 1, 1]} : (tensor<?x1x1x?xf32>, index, index) -> tensor<?xf32>
    %144 = "linalg.init_tensor"(%142) {static_sizes = [-1]} : (index) -> tensor<?xf32>
    %145 = "linalg.generic"(%143, %144) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.subf"(%19, %arg1) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %13) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
    %146 = "tensor.dim"(%145, %2) : (tensor<?xf32>, index) -> index
    %147 = "tensor.dim"(%145, %2) : (tensor<?xf32>, index) -> index
    %148 = "linalg.fill"(%3, %124) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<i64>) -> tensor<i64>
    %149 = "tensor.extract"(%148) : (tensor<i64>) -> i64
    %150 = "arith.addi"(%149, %3) : (i64, i64) -> i64
    %151 = "arith.cmpi"(%149, %4) {predicate = 5 : i64} : (i64, i64) -> i1
    %152 = "std.select"(%151, %149, %150) : (i1, i64, i64) -> i64
    %153 = "arith.cmpi"(%152, %4) {predicate = 2 : i64} : (i64, i64) -> i1
    %154 = "std.select"(%153, %4, %152) : (i1, i64, i64) -> i64
    %155 = "arith.cmpi"(%154, %3) {predicate = 4 : i64} : (i64, i64) -> i1
    %156 = "std.select"(%155, %3, %154) : (i1, i64, i64) -> i64
    %157 = "arith.index_cast"(%156) : (i64) -> index
    %158 = "arith.cmpi"(%157, %2) {predicate = 5 : i64} : (index, index) -> i1
    %159 = "std.select"(%158, %157, %2) : (i1, index, index) -> index
    %160 = "tensor.extract_slice"(%22, %159) {operand_segment_sizes = dense<[1, 0, 1, 0]> : vector<4xi32>, static_offsets = [0, 0], static_sizes = [1, -1], static_strides = [1, 1]} : (tensor<1x512xi64>, index) -> tensor<?xi64>
    %161 = "flow.tensor.reshape"(%123) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<1x512xi64>) -> tensor<512xi64>
    %162 = "arith.cmpi"(%5, %159) {predicate = 0 : i64} : (index, index) -> i1
    "std.assert"(%162) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %163 = "linalg.init_tensor"() {static_sizes = [512, 384]} : () -> tensor<512x384xf32>
    %164 = "linalg.generic"(%161, %160, %163) ({
    ^bb0(%arg1: i64, %arg2: i64, %arg3: f32):
      %600 = "linalg.index"() {dim = 1 : i64} : () -> index
      %601 = "tensor.extract"(%24, %2, %600) : (tensor<2x384xf32>, index, index) -> f32
      %602 = "arith.index_cast"(%arg1) : (i64) -> index
      %603 = "arith.cmpi"(%arg1, %14) {predicate = 2 : i64} : (i64, i64) -> i1
      "std.assert"(%603) {msg = "index must be smaller than dim size"} : (i1) -> ()
      %604 = "arith.cmpi"(%arg1, %4) {predicate = 5 : i64} : (i64, i64) -> i1
      "std.assert"(%604) {msg = "index must be larger or equal to 0"} : (i1) -> ()
      %605 = "tensor.extract"(%23, %602, %600) : (tensor<30522x384xf32>, index, index) -> f32
      %606 = "arith.index_cast"(%arg2) : (i64) -> index
      %607 = "arith.cmpi"(%arg2, %3) {predicate = 2 : i64} : (i64, i64) -> i1
      "std.assert"(%607) {msg = "index must be smaller than dim size"} : (i1) -> ()
      %608 = "arith.cmpi"(%arg2, %4) {predicate = 5 : i64} : (i64, i64) -> i1
      "std.assert"(%608) {msg = "index must be larger or equal to 0"} : (i1) -> ()
      %609 = "tensor.extract"(%25, %606, %600) : (tensor<512x384xf32>, index, index) -> f32
      %610 = "arith.addf"(%605, %601) : (f32, f32) -> f32
      %611 = "arith.addf"(%610, %609) : (f32, f32) -> f32
      "linalg.yield"(%611) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<512xi64>, tensor<?xi64>, tensor<512x384xf32>) -> tensor<512x384xf32>
    %165 = "linalg.init_tensor"() {static_sizes = [512]} : () -> tensor<512xf32>
    %166 = "linalg.fill"(%15, %165) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<512xf32>) -> tensor<512xf32>
    %167 = "linalg.generic"(%164, %166) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512xf32>) -> tensor<512xf32>
    %168 = "linalg.generic"(%167, %165) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32>
    %169 = "linalg.fill"(%15, %165) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<512xf32>) -> tensor<512xf32>
    %170 = "linalg.generic"(%164, %168, %169) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32>
    %171 = "linalg.generic"(%164, %168, %170, %27, %26, %163) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512xf32>, tensor<512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
    %172 = "flow.tensor.reshape"(%171) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
    %173 = "linalg.generic"(%28, %163) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
    %174 = "flow.tensor.reshape"(%173) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
    %175 = "linalg.batch_matmul"(%172, %12, %174) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %176 = "linalg.generic"(%29, %163) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
    %177 = "flow.tensor.reshape"(%176) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
    %178 = "linalg.batch_matmul"(%172, %11, %177) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %179 = "flow.tensor.reshape"(%178, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<1x512x384xf32>, index, index) -> tensor<?x?x12x32xf32>
    %180 = "linalg.generic"(%30, %163) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
    %181 = "flow.tensor.reshape"(%180) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
    %182 = "linalg.batch_matmul"(%172, %10, %181) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %183 = "flow.tensor.reshape"(%182, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<1x512x384xf32>, index, index) -> tensor<?x?x12x32xf32>
    %184 = "flow.tensor.reshape"(%175, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<1x512x384xf32>, index, index) -> tensor<?x?x12x32xf32>
    %185 = "linalg.init_tensor"() {static_sizes = [12, 32, 512]} : () -> tensor<12x32x512xf32>
    %186 = "linalg.generic"(%179, %185) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<12x32x512xf32>) -> tensor<12x32x512xf32>
    %187 = "linalg.init_tensor"() {static_sizes = [12, 512, 32]} : () -> tensor<12x512x32xf32>
    %188 = "linalg.generic"(%184, %187) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
    %189 = "linalg.init_tensor"() {static_sizes = [12, 512, 512]} : () -> tensor<12x512x512xf32>
    %190 = "linalg.fill"(%15, %189) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
    %191 = "linalg.generic"(%188, %186, %190) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x32xf32>, tensor<12x32x512xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
    %192 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
    "std.assert"(%192) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %193 = "linalg.generic"(%191, %145, %189) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.truncf"(%9) : (f64) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<?xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
    %194 = "linalg.init_tensor"() {static_sizes = [12, 512]} : () -> tensor<12x512xf32>
    %195 = "linalg.fill"(%16, %194) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<12x512xf32>) -> tensor<12x512xf32>
    %196 = "linalg.init_tensor"() {static_sizes = [12, 512]} : () -> tensor<12x512xi64>
    %197 = "linalg.fill"(%4, %196) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<12x512xi64>) -> tensor<12x512xi64>
    %198:2 = "linalg.generic"(%193, %195, %197) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
      %600 = "linalg.index"() {dim = 2 : i64} : () -> index
      %601 = "arith.index_cast"(%600) : (index) -> i64
      %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
      %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
      %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
      "linalg.yield"(%603, %604) : (f32, i64) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>, tensor<12x512xi64>) -> (tensor<12x512xf32>, tensor<12x512xi64>)
    %199 = "linalg.generic"(%193, %198#0, %189) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "math.exp"(%600) : (f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
    %200 = "linalg.fill"(%15, %194) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<12x512xf32>) -> tensor<12x512xf32>
    %201 = "linalg.generic"(%199, %200) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>) -> tensor<12x512xf32>
    %202 = "linalg.generic"(%199, %201, %189) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512xf32>, tensor<12x512x512xf32>) -> tensor<12x512x512xf32>
    %203 = "linalg.generic"(%183, %187) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
    %204 = "linalg.fill"(%15, %187) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
    %205 = "linalg.generic"(%202, %203, %204) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<12x512x512xf32>, tensor<12x512x32xf32>, tensor<12x512x32xf32>) -> tensor<12x512x32xf32>
    %206 = "linalg.init_tensor"() {static_sizes = [512, 12, 32]} : () -> tensor<512x12x32xf32>
    %207 = "linalg.generic"(%205, %206) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d0, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<12x512x32xf32>, tensor<512x12x32xf32>) -> tensor<512x12x32xf32>
    %208 = "flow.tensor.reshape"(%207, %0, %5) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<512x12x32xf32>, index, index) -> tensor<?x?x384xf32>
    %209 = "linalg.generic"(%31, %163) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<512x384xf32>) -> tensor<512x384xf32>
    %210 = "flow.tensor.reshape"(%209) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<512x384xf32>) -> tensor<1x512x384xf32>
    %211 = "linalg.batch_matmul"(%208, %8, %210) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %212 = "linalg.init_tensor"() {static_sizes = [1, 512, 384]} : () -> tensor<1x512x384xf32>
    %213 = "flow.tensor.reshape"(%211) {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (tensor<1x512x384xf32>) -> tensor<512x384xf32>
    %214 = "linalg.generic"(%213, %171, %212) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (0, d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<512x384xf32>, tensor<512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %215 = "linalg.init_tensor"() {static_sizes = [1, 512]} : () -> tensor<1x512xf32>
    %216 = "linalg.fill"(%15, %215) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %217 = "linalg.generic"(%214, %216) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %218 = "linalg.generic"(%217, %215) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %219 = "linalg.fill"(%15, %215) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %220 = "linalg.generic"(%214, %218, %219) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %221 = "linalg.generic"(%214, %218, %220, %33, %32, %212) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %222 = "linalg.init_tensor"() {static_sizes = [1, 512, 1536]} : () -> tensor<1x512x1536xf32>
    %223 = "linalg.init_tensor"() {static_sizes = [1, 384, 1536]} : () -> tensor<1x384x1536xf32>
    %224 = "linalg.generic"(%34, %222) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %225 = "linalg.generic"(%35, %223) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<1x384x1536xf32>) -> tensor<1x384x1536xf32>
    %226 = "linalg.batch_matmul"(%221, %225, %224) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %227 = "linalg.generic"(%226, %222) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "math.sqrt"(%18) : (f32) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "math.erf"(%601) : (f32) -> f32
      %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
      %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
      %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
      "linalg.yield"(%605) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %228 = "linalg.init_tensor"() {static_sizes = [1, 1536, 384]} : () -> tensor<1x1536x384xf32>
    %229 = "linalg.generic"(%36, %212) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %230 = "linalg.generic"(%37, %228) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<1x1536x384xf32>) -> tensor<1x1536x384xf32>
    %231 = "linalg.batch_matmul"(%227, %230, %229) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x1536x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %232 = "linalg.generic"(%231, %221, %212) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %233 = "linalg.fill"(%15, %215) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %234 = "linalg.generic"(%232, %233) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %235 = "linalg.generic"(%234, %215) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %236 = "linalg.fill"(%15, %215) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %237 = "linalg.generic"(%232, %235, %236) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %238 = "linalg.generic"(%232, %235, %237, %39, %38, %212) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %239 = "linalg.init_tensor"() {static_sizes = [1, 384, 384]} : () -> tensor<1x384x384xf32>
    %240 = "linalg.generic"(%40, %212) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %241 = "linalg.generic"(%41, %239) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %242 = "linalg.batch_matmul"(%238, %241, %240) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %243 = "tensor.cast"(%242) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %244 = "linalg.generic"(%42, %212) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %245 = "linalg.generic"(%43, %239) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %246 = "linalg.batch_matmul"(%238, %245, %244) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %247 = "tensor.cast"(%246) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %248 = "flow.tensor.reshape"(%247, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %249 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 32]} : () -> tensor<1x12x512x32xf32>
    %250 = "linalg.generic"(%44, %212) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %251 = "linalg.generic"(%45, %239) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %252 = "linalg.batch_matmul"(%238, %251, %250) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %253 = "tensor.cast"(%252) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %254 = "flow.tensor.reshape"(%253, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %255 = "linalg.generic"(%254, %249) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %256 = "flow.tensor.reshape"(%243, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %257 = "linalg.generic"(%256, %249) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %258 = "linalg.init_tensor"() {static_sizes = [1, 12, 32, 512]} : () -> tensor<1x12x32x512xf32>
    %259 = "linalg.generic"(%248, %258) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x32x512xf32>) -> tensor<1x12x32x512xf32>
    %260 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 512]} : () -> tensor<1x12x512x512xf32>
    %261 = "linalg.fill"(%15, %260) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %262 = "linalg.generic"(%257, %259, %261) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x12x32x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    "std.assert"(%192) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %263 = "flow.tensor.reshape"(%145, %147, %146) {operand_segment_sizes = dense<1> : vector<3xi32>} : (tensor<?xf32>, index, index) -> tensor<1x?xf32>
    %264 = "linalg.generic"(%262, %263, %260) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.truncf"(%9) : (f64) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x?xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %265 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xf32>
    %266 = "linalg.fill"(%16, %265) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %267 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xi64>
    %268 = "linalg.fill"(%4, %267) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<1x12x512xi64>) -> tensor<1x12x512xi64>
    %269:2 = "linalg.generic"(%264, %266, %268) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
      %600 = "linalg.index"() {dim = 3 : i64} : () -> index
      %601 = "arith.index_cast"(%600) : (index) -> i64
      %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
      %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
      %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
      "linalg.yield"(%603, %604) : (f32, i64) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512xi64>) -> (tensor<1x12x512xf32>, tensor<1x12x512xi64>)
    %270 = "linalg.generic"(%264, %269#0, %260) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "math.exp"(%600) : (f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %271 = "linalg.fill"(%15, %265) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %272 = "linalg.generic"(%270, %271) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %273 = "linalg.generic"(%270, %272, %260) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %274 = "linalg.fill"(%15, %249) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %275 = "linalg.generic"(%273, %255, %274) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %276 = "linalg.init_tensor"() {static_sizes = [1, 512, 12, 32]} : () -> tensor<1x512x12x32xf32>
    %277 = "linalg.generic"(%275, %276) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x512x12x32xf32>) -> tensor<1x512x12x32xf32>
    %278 = "tensor.cast"(%277) : (tensor<1x512x12x32xf32>) -> tensor<?x?x12x32xf32>
    %279 = "flow.tensor.reshape"(%278, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
    %280 = "linalg.generic"(%46, %212) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %281 = "linalg.generic"(%47, %239) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %282 = "linalg.batch_matmul"(%279, %281, %280) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %283 = "linalg.init_tensor"() {static_sizes = [0, 512, 384]} : () -> tensor<0x512x384xf32>
    %284 = "linalg.generic"(%282, %238, %283) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %285 = "linalg.init_tensor"() {static_sizes = [0, 512]} : () -> tensor<0x512xf32>
    %286 = "linalg.fill"(%15, %285) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %287 = "linalg.generic"(%284, %286) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %288 = "linalg.generic"(%287, %285) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %289 = "linalg.fill"(%15, %285) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %290 = "linalg.generic"(%284, %288, %289) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %291 = "linalg.generic"(%284, %288, %290, %49, %48, %283) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %292 = "linalg.init_tensor"() {static_sizes = [0, 512, 1536]} : () -> tensor<0x512x1536xf32>
    %293 = "linalg.init_tensor"() {static_sizes = [0, 384, 1536]} : () -> tensor<0x384x1536xf32>
    %294 = "linalg.generic"(%50, %292) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %295 = "linalg.generic"(%51, %293) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<0x384x1536xf32>) -> tensor<0x384x1536xf32>
    %296 = "linalg.batch_matmul"(%291, %295, %294) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %297 = "linalg.generic"(%296, %292) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "math.sqrt"(%18) : (f32) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "math.erf"(%601) : (f32) -> f32
      %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
      %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
      %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
      "linalg.yield"(%605) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %298 = "linalg.init_tensor"() {static_sizes = [0, 1536, 384]} : () -> tensor<0x1536x384xf32>
    %299 = "linalg.generic"(%52, %283) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %300 = "linalg.generic"(%53, %298) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<0x1536x384xf32>) -> tensor<0x1536x384xf32>
    %301 = "linalg.batch_matmul"(%297, %300, %299) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x1536x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %302 = "linalg.generic"(%301, %291, %283) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %303 = "linalg.fill"(%15, %285) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %304 = "linalg.generic"(%302, %303) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %305 = "linalg.generic"(%304, %285) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %306 = "linalg.fill"(%15, %285) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %307 = "linalg.generic"(%302, %305, %306) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %308 = "linalg.generic"(%302, %305, %307, %55, %54, %283) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %309 = "linalg.init_tensor"() {static_sizes = [0, 384, 384]} : () -> tensor<0x384x384xf32>
    %310 = "linalg.generic"(%56, %283) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %311 = "linalg.generic"(%57, %309) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %312 = "linalg.batch_matmul"(%308, %311, %310) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %313 = "tensor.cast"(%312) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
    %314 = "linalg.generic"(%58, %283) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %315 = "linalg.generic"(%59, %309) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %316 = "linalg.batch_matmul"(%308, %315, %314) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %317 = "tensor.cast"(%316) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
    %318 = "flow.tensor.reshape"(%317, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %319 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 32]} : () -> tensor<0x12x512x32xf32>
    %320 = "linalg.generic"(%60, %283) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %321 = "linalg.generic"(%61, %309) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %322 = "linalg.batch_matmul"(%308, %321, %320) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %323 = "tensor.cast"(%322) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
    %324 = "flow.tensor.reshape"(%323, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %325 = "linalg.generic"(%324, %319) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %326 = "flow.tensor.reshape"(%313, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %327 = "linalg.generic"(%326, %319) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %328 = "linalg.init_tensor"() {static_sizes = [0, 12, 32, 512]} : () -> tensor<0x12x32x512xf32>
    %329 = "linalg.generic"(%318, %328) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x32x512xf32>) -> tensor<0x12x32x512xf32>
    %330 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 512]} : () -> tensor<0x12x512x512xf32>
    %331 = "linalg.fill"(%15, %330) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %332 = "linalg.generic"(%327, %329, %331) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x12x32x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
    "std.assert"(%192) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %333 = "linalg.generic"(%332, %263, %330) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.truncf"(%9) : (f64) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<1x?xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %334 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xf32>
    %335 = "linalg.fill"(%16, %334) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
    %336 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xi64>
    %337 = "linalg.fill"(%4, %336) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<0x12x512xi64>) -> tensor<0x12x512xi64>
    %338:2 = "linalg.generic"(%333, %335, %337) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
      %600 = "linalg.index"() {dim = 3 : i64} : () -> index
      %601 = "arith.index_cast"(%600) : (index) -> i64
      %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
      %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
      %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
      "linalg.yield"(%603, %604) : (f32, i64) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512xi64>) -> (tensor<0x12x512xf32>, tensor<0x12x512xi64>)
    %339 = "linalg.generic"(%333, %338#0, %330) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "math.exp"(%600) : (f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %340 = "linalg.fill"(%15, %334) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
    %341 = "linalg.generic"(%339, %340) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
    %342 = "linalg.generic"(%339, %341, %330) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %343 = "linalg.fill"(%15, %319) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %344 = "linalg.generic"(%342, %325, %343) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %345 = "linalg.init_tensor"() {static_sizes = [0, 512, 12, 32]} : () -> tensor<0x512x12x32xf32>
    %346 = "linalg.generic"(%344, %345) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x512x12x32xf32>) -> tensor<0x512x12x32xf32>
    %347 = "tensor.cast"(%346) : (tensor<0x512x12x32xf32>) -> tensor<?x?x12x32xf32>
    %348 = "flow.tensor.reshape"(%347, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
    %349 = "linalg.generic"(%62, %283) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %350 = "linalg.generic"(%63, %309) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %351 = "linalg.batch_matmul"(%348, %350, %349) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %352 = "linalg.init_tensor"() {static_sizes = [1, 512, 384]} : () -> tensor<1x512x384xf32>
    %353 = "linalg.generic"(%351, %308, %352) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %354 = "linalg.init_tensor"() {static_sizes = [1, 512]} : () -> tensor<1x512xf32>
    %355 = "linalg.fill"(%15, %354) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %356 = "linalg.generic"(%353, %355) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %357 = "linalg.generic"(%356, %354) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %358 = "linalg.fill"(%15, %354) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %359 = "linalg.generic"(%353, %357, %358) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %360 = "linalg.generic"(%353, %357, %359, %65, %64, %352) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %361 = "linalg.init_tensor"() {static_sizes = [1, 512, 1536]} : () -> tensor<1x512x1536xf32>
    %362 = "linalg.init_tensor"() {static_sizes = [1, 384, 1536]} : () -> tensor<1x384x1536xf32>
    %363 = "linalg.generic"(%66, %361) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %364 = "linalg.generic"(%67, %362) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<1x384x1536xf32>) -> tensor<1x384x1536xf32>
    %365 = "linalg.batch_matmul"(%360, %364, %363) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %366 = "linalg.generic"(%365, %361) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "math.sqrt"(%18) : (f32) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "math.erf"(%601) : (f32) -> f32
      %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
      %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
      %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
      "linalg.yield"(%605) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %367 = "linalg.init_tensor"() {static_sizes = [1, 1536, 384]} : () -> tensor<1x1536x384xf32>
    %368 = "linalg.generic"(%68, %352) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %369 = "linalg.generic"(%69, %367) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<1x1536x384xf32>) -> tensor<1x1536x384xf32>
    %370 = "linalg.batch_matmul"(%366, %369, %368) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x1536x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %371 = "linalg.generic"(%370, %360, %352) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %372 = "linalg.fill"(%15, %354) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %373 = "linalg.generic"(%371, %372) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %374 = "linalg.generic"(%373, %354) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %375 = "linalg.fill"(%15, %354) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %376 = "linalg.generic"(%371, %374, %375) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %377 = "linalg.generic"(%371, %374, %376, %71, %70, %352) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %378 = "linalg.init_tensor"() {static_sizes = [1, 384, 384]} : () -> tensor<1x384x384xf32>
    %379 = "linalg.generic"(%72, %352) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %380 = "linalg.generic"(%73, %378) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %381 = "linalg.batch_matmul"(%377, %380, %379) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %382 = "tensor.cast"(%381) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %383 = "linalg.generic"(%74, %352) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %384 = "linalg.generic"(%75, %378) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %385 = "linalg.batch_matmul"(%377, %384, %383) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %386 = "tensor.cast"(%385) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %387 = "flow.tensor.reshape"(%386, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %388 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 32]} : () -> tensor<1x12x512x32xf32>
    %389 = "linalg.generic"(%76, %352) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %390 = "linalg.generic"(%77, %378) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %391 = "linalg.batch_matmul"(%377, %390, %389) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %392 = "tensor.cast"(%391) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %393 = "flow.tensor.reshape"(%392, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %394 = "linalg.generic"(%393, %388) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %395 = "flow.tensor.reshape"(%382, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %396 = "linalg.generic"(%395, %388) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %397 = "linalg.init_tensor"() {static_sizes = [1, 12, 32, 512]} : () -> tensor<1x12x32x512xf32>
    %398 = "linalg.generic"(%387, %397) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x32x512xf32>) -> tensor<1x12x32x512xf32>
    %399 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 512]} : () -> tensor<1x12x512x512xf32>
    %400 = "linalg.fill"(%15, %399) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %401 = "linalg.generic"(%396, %398, %400) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x12x32x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %402 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
    "std.assert"(%402) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %403 = "linalg.generic"(%401, %263, %399) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.truncf"(%9) : (f64) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x?xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %404 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xf32>
    %405 = "linalg.fill"(%16, %404) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %406 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xi64>
    %407 = "linalg.fill"(%4, %406) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<1x12x512xi64>) -> tensor<1x12x512xi64>
    %408:2 = "linalg.generic"(%403, %405, %407) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
      %600 = "linalg.index"() {dim = 3 : i64} : () -> index
      %601 = "arith.index_cast"(%600) : (index) -> i64
      %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
      %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
      %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
      "linalg.yield"(%603, %604) : (f32, i64) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512xi64>) -> (tensor<1x12x512xf32>, tensor<1x12x512xi64>)
    %409 = "linalg.generic"(%403, %408#0, %399) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "math.exp"(%600) : (f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %410 = "linalg.fill"(%15, %404) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %411 = "linalg.generic"(%409, %410) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %412 = "linalg.generic"(%409, %411, %399) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %413 = "linalg.fill"(%15, %388) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %414 = "linalg.generic"(%412, %394, %413) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %415 = "linalg.init_tensor"() {static_sizes = [1, 512, 12, 32]} : () -> tensor<1x512x12x32xf32>
    %416 = "linalg.generic"(%414, %415) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x512x12x32xf32>) -> tensor<1x512x12x32xf32>
    %417 = "tensor.cast"(%416) : (tensor<1x512x12x32xf32>) -> tensor<?x?x12x32xf32>
    %418 = "flow.tensor.reshape"(%417, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
    %419 = "linalg.generic"(%78, %352) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %420 = "linalg.generic"(%79, %378) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %421 = "linalg.batch_matmul"(%418, %420, %419) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %422 = "linalg.init_tensor"() {static_sizes = [0, 512, 384]} : () -> tensor<0x512x384xf32>
    %423 = "linalg.generic"(%421, %377, %422) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %424 = "linalg.init_tensor"() {static_sizes = [0, 512]} : () -> tensor<0x512xf32>
    %425 = "linalg.fill"(%15, %424) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %426 = "linalg.generic"(%423, %425) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %427 = "linalg.generic"(%426, %424) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %428 = "linalg.fill"(%15, %424) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %429 = "linalg.generic"(%423, %427, %428) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %430 = "linalg.generic"(%423, %427, %429, %81, %80, %422) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %431 = "linalg.init_tensor"() {static_sizes = [0, 512, 1536]} : () -> tensor<0x512x1536xf32>
    %432 = "linalg.init_tensor"() {static_sizes = [0, 384, 1536]} : () -> tensor<0x384x1536xf32>
    %433 = "linalg.generic"(%82, %431) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %434 = "linalg.generic"(%83, %432) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<0x384x1536xf32>) -> tensor<0x384x1536xf32>
    %435 = "linalg.batch_matmul"(%430, %434, %433) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %436 = "linalg.generic"(%435, %431) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "math.sqrt"(%18) : (f32) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "math.erf"(%601) : (f32) -> f32
      %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
      %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
      %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
      "linalg.yield"(%605) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %437 = "linalg.init_tensor"() {static_sizes = [0, 1536, 384]} : () -> tensor<0x1536x384xf32>
    %438 = "linalg.generic"(%84, %422) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %439 = "linalg.generic"(%85, %437) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<0x1536x384xf32>) -> tensor<0x1536x384xf32>
    %440 = "linalg.batch_matmul"(%436, %439, %438) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x1536x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %441 = "linalg.generic"(%440, %430, %422) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %442 = "linalg.fill"(%15, %424) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %443 = "linalg.generic"(%441, %442) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %444 = "linalg.generic"(%443, %424) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %445 = "linalg.fill"(%15, %424) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %446 = "linalg.generic"(%441, %444, %445) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %447 = "linalg.generic"(%441, %444, %446, %87, %86, %422) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %448 = "linalg.init_tensor"() {static_sizes = [0, 384, 384]} : () -> tensor<0x384x384xf32>
    %449 = "linalg.generic"(%88, %422) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %450 = "linalg.generic"(%89, %448) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %451 = "linalg.batch_matmul"(%447, %450, %449) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %452 = "tensor.cast"(%451) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
    %453 = "linalg.generic"(%90, %422) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %454 = "linalg.generic"(%91, %448) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %455 = "linalg.batch_matmul"(%447, %454, %453) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %456 = "tensor.cast"(%455) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
    %457 = "flow.tensor.reshape"(%456, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %458 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 32]} : () -> tensor<0x12x512x32xf32>
    %459 = "linalg.generic"(%92, %422) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %460 = "linalg.generic"(%93, %448) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %461 = "linalg.batch_matmul"(%447, %460, %459) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %462 = "tensor.cast"(%461) : (tensor<0x512x384xf32>) -> tensor<?x?x384xf32>
    %463 = "flow.tensor.reshape"(%462, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %464 = "linalg.generic"(%463, %458) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %465 = "flow.tensor.reshape"(%452, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %466 = "linalg.generic"(%465, %458) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %467 = "linalg.init_tensor"() {static_sizes = [0, 12, 32, 512]} : () -> tensor<0x12x32x512xf32>
    %468 = "linalg.generic"(%457, %467) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<0x12x32x512xf32>) -> tensor<0x12x32x512xf32>
    %469 = "linalg.init_tensor"() {static_sizes = [0, 12, 512, 512]} : () -> tensor<0x12x512x512xf32>
    %470 = "linalg.fill"(%15, %469) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %471 = "linalg.generic"(%466, %468, %470) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x12x32x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %472 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
    "std.assert"(%472) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %473 = "linalg.generic"(%471, %263, %469) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.truncf"(%9) : (f64) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<1x?xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %474 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xf32>
    %475 = "linalg.fill"(%16, %474) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
    %476 = "linalg.init_tensor"() {static_sizes = [0, 12, 512]} : () -> tensor<0x12x512xi64>
    %477 = "linalg.fill"(%4, %476) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<0x12x512xi64>) -> tensor<0x12x512xi64>
    %478:2 = "linalg.generic"(%473, %475, %477) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
      %600 = "linalg.index"() {dim = 3 : i64} : () -> index
      %601 = "arith.index_cast"(%600) : (index) -> i64
      %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
      %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
      %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
      "linalg.yield"(%603, %604) : (f32, i64) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512xi64>) -> (tensor<0x12x512xf32>, tensor<0x12x512xi64>)
    %479 = "linalg.generic"(%473, %478#0, %469) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "math.exp"(%600) : (f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %480 = "linalg.fill"(%15, %474) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
    %481 = "linalg.generic"(%479, %480) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>) -> tensor<0x12x512xf32>
    %482 = "linalg.generic"(%479, %481, %469) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512xf32>, tensor<0x12x512x512xf32>) -> tensor<0x12x512x512xf32>
    %483 = "linalg.fill"(%15, %458) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %484 = "linalg.generic"(%482, %464, %483) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x12x512x512xf32>, tensor<0x12x512x32xf32>, tensor<0x12x512x32xf32>) -> tensor<0x12x512x32xf32>
    %485 = "linalg.init_tensor"() {static_sizes = [0, 512, 12, 32]} : () -> tensor<0x512x12x32xf32>
    %486 = "linalg.generic"(%484, %485) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x12x512x32xf32>, tensor<0x512x12x32xf32>) -> tensor<0x512x12x32xf32>
    %487 = "tensor.cast"(%486) : (tensor<0x512x12x32xf32>) -> tensor<?x?x12x32xf32>
    %488 = "flow.tensor.reshape"(%487, %2, %5, %2, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
    %489 = "linalg.generic"(%94, %422) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %490 = "linalg.generic"(%95, %448) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<0x384x384xf32>) -> tensor<0x384x384xf32>
    %491 = "linalg.batch_matmul"(%488, %490, %489) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<0x384x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %492 = "linalg.init_tensor"() {static_sizes = [1, 512, 384]} : () -> tensor<1x512x384xf32>
    %493 = "linalg.generic"(%491, %447, %492) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %494 = "linalg.init_tensor"() {static_sizes = [1, 512]} : () -> tensor<1x512xf32>
    %495 = "linalg.fill"(%15, %494) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %496 = "linalg.generic"(%493, %495) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %497 = "linalg.generic"(%496, %494) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %498 = "linalg.fill"(%15, %494) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %499 = "linalg.generic"(%493, %497, %498) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %500 = "linalg.generic"(%493, %497, %499, %97, %96, %492) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %501 = "linalg.init_tensor"() {static_sizes = [1, 512, 1536]} : () -> tensor<1x512x1536xf32>
    %502 = "linalg.init_tensor"() {static_sizes = [1, 384, 1536]} : () -> tensor<1x384x1536xf32>
    %503 = "linalg.generic"(%98, %501) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %504 = "linalg.generic"(%99, %502) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<1x384x1536xf32>) -> tensor<1x384x1536xf32>
    %505 = "linalg.batch_matmul"(%500, %504, %503) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %506 = "linalg.generic"(%505, %501) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "math.sqrt"(%18) : (f32) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "math.erf"(%601) : (f32) -> f32
      %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
      %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
      %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
      "linalg.yield"(%605) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x512x1536xf32>) -> tensor<1x512x1536xf32>
    %507 = "linalg.init_tensor"() {static_sizes = [1, 1536, 384]} : () -> tensor<1x1536x384xf32>
    %508 = "linalg.generic"(%100, %492) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %509 = "linalg.generic"(%101, %507) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<1x1536x384xf32>) -> tensor<1x1536x384xf32>
    %510 = "linalg.batch_matmul"(%506, %509, %508) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x1536xf32>, tensor<1x1536x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %511 = "linalg.generic"(%510, %500, %492) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %512 = "linalg.fill"(%15, %494) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %513 = "linalg.generic"(%511, %512) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %514 = "linalg.generic"(%513, %494) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %515 = "linalg.fill"(%15, %494) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x512xf32>) -> tensor<1x512xf32>
    %516 = "linalg.generic"(%511, %514, %515) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32>
    %517 = "linalg.generic"(%511, %514, %516, %103, %102, %492) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512xf32>, tensor<1x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %518 = "linalg.init_tensor"() {static_sizes = [1, 384, 384]} : () -> tensor<1x384x384xf32>
    %519 = "linalg.generic"(%104, %492) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %520 = "linalg.generic"(%105, %518) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %521 = "linalg.batch_matmul"(%517, %520, %519) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %522 = "tensor.cast"(%521) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %523 = "linalg.generic"(%106, %492) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %524 = "linalg.generic"(%107, %518) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %525 = "linalg.batch_matmul"(%517, %524, %523) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %526 = "tensor.cast"(%525) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %527 = "flow.tensor.reshape"(%526, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %528 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 32]} : () -> tensor<1x12x512x32xf32>
    %529 = "linalg.generic"(%108, %492) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %530 = "linalg.generic"(%109, %518) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %531 = "linalg.batch_matmul"(%517, %530, %529) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %532 = "tensor.cast"(%531) : (tensor<1x512x384xf32>) -> tensor<?x?x384xf32>
    %533 = "flow.tensor.reshape"(%532, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %534 = "linalg.generic"(%533, %528) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %535 = "flow.tensor.reshape"(%522, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index, index) -> tensor<?x?x12x32xf32>
    %536 = "linalg.generic"(%535, %528) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %537 = "linalg.init_tensor"() {static_sizes = [1, 12, 32, 512]} : () -> tensor<1x12x32x512xf32>
    %538 = "linalg.generic"(%527, %537) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?x12x32xf32>, tensor<1x12x32x512xf32>) -> tensor<1x12x32x512xf32>
    %539 = "linalg.init_tensor"() {static_sizes = [1, 12, 512, 512]} : () -> tensor<1x12x512x512xf32>
    %540 = "linalg.fill"(%15, %539) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %541 = "linalg.generic"(%536, %538, %540) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x12x32x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %542 = "arith.cmpi"(%5, %142) {predicate = 0 : i64} : (index, index) -> i1
    "std.assert"(%542) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %543 = "linalg.generic"(%541, %263, %539) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.truncf"(%9) : (f64) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%601, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x?xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %544 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xf32>
    %545 = "linalg.fill"(%16, %544) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %546 = "linalg.init_tensor"() {static_sizes = [1, 12, 512]} : () -> tensor<1x12x512xi64>
    %547 = "linalg.fill"(%4, %546) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<1x12x512xi64>) -> tensor<1x12x512xi64>
    %548:2 = "linalg.generic"(%543, %545, %547) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: i64):
      %600 = "linalg.index"() {dim = 3 : i64} : () -> index
      %601 = "arith.index_cast"(%600) : (index) -> i64
      %602 = "arith.cmpf"(%arg1, %arg2) {predicate = 2 : i64} : (f32, f32) -> i1
      %603 = "std.select"(%602, %arg1, %arg2) : (i1, f32, f32) -> f32
      %604 = "std.select"(%602, %601, %arg3) : (i1, i64, i64) -> i64
      "linalg.yield"(%603, %604) : (f32, i64) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[1, 2]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512xi64>) -> (tensor<1x12x512xf32>, tensor<1x12x512xi64>)
    %549 = "linalg.generic"(%543, %548#0, %539) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "math.exp"(%600) : (f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %550 = "linalg.fill"(%15, %544) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %551 = "linalg.generic"(%549, %550) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>) -> tensor<1x12x512xf32>
    %552 = "linalg.generic"(%549, %551, %539) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.divf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
    %553 = "linalg.fill"(%15, %528) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %554 = "linalg.generic"(%552, %534, %553) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%600, %arg3) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x12x512x512xf32>, tensor<1x12x512x32xf32>, tensor<1x12x512x32xf32>) -> tensor<1x12x512x32xf32>
    %555 = "linalg.init_tensor"() {static_sizes = [1, 512, 12, 32]} : () -> tensor<1x512x12x32xf32>
    %556 = "linalg.generic"(%554, %555) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1x12x512x32xf32>, tensor<1x512x12x32xf32>) -> tensor<1x512x12x32xf32>
    %557 = "tensor.cast"(%556) : (tensor<1x512x12x32xf32>) -> tensor<?x?x12x32xf32>
    %558 = "flow.tensor.reshape"(%557, %0, %5, %0, %5) {operand_segment_sizes = dense<[1, 2, 2]> : vector<3xi32>} : (tensor<?x?x12x32xf32>, index, index, index, index) -> tensor<?x?x384xf32>
    %559 = "linalg.generic"(%110, %492) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    %560 = "linalg.generic"(%111, %518) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32>
    %561 = "linalg.batch_matmul"(%558, %560, %559) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x?x384xf32>, tensor<1x384x384xf32>, tensor<1x512x384xf32>) -> tensor<1x512x384xf32>
    "std.assert"(%1) {msg = "mismatched size for broadcast"} : (i1) -> ()
    %562 = "linalg.init_tensor"() {static_sizes = [0, 512, 384]} : () -> tensor<0x512x384xf32>
    %563 = "linalg.generic"(%561, %517, %562) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<1x512x384xf32>, tensor<1x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %564 = "linalg.init_tensor"() {static_sizes = [0, 512]} : () -> tensor<0x512xf32>
    %565 = "linalg.fill"(%15, %564) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %566 = "linalg.generic"(%563, %565) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %567 = "linalg.generic"(%566, %564) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %568 = "linalg.fill"(%15, %564) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %569 = "linalg.generic"(%563, %567, %568) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %570 = "linalg.generic"(%563, %567, %569, %113, %112, %562) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %571 = "linalg.init_tensor"() {static_sizes = [0, 512, 1536]} : () -> tensor<0x512x1536xf32>
    %572 = "linalg.init_tensor"() {static_sizes = [0, 384, 1536]} : () -> tensor<0x384x1536xf32>
    %573 = "linalg.generic"(%114, %571) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %574 = "linalg.generic"(%115, %572) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<1536x384xf32>, tensor<0x384x1536xf32>) -> tensor<0x384x1536xf32>
    %575 = "linalg.batch_matmul"(%570, %574, %573) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x384x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %576 = "linalg.generic"(%575, %571) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "math.sqrt"(%18) : (f32) -> f32
      %601 = "arith.divf"(%arg1, %600) : (f32, f32) -> f32
      %602 = "math.erf"(%601) : (f32) -> f32
      %603 = "arith.addf"(%602, %19) : (f32, f32) -> f32
      %604 = "arith.mulf"(%603, %17) : (f32, f32) -> f32
      %605 = "arith.mulf"(%arg1, %604) : (f32, f32) -> f32
      "linalg.yield"(%605) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x512x1536xf32>) -> tensor<0x512x1536xf32>
    %577 = "linalg.init_tensor"() {static_sizes = [0, 1536, 384]} : () -> tensor<0x1536x384xf32>
    %578 = "linalg.generic"(%116, %562) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %579 = "linalg.generic"(%117, %577) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384x1536xf32>, tensor<0x1536x384xf32>) -> tensor<0x1536x384xf32>
    %580 = "linalg.batch_matmul"(%576, %579, %578) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x1536xf32>, tensor<0x1536x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %581 = "linalg.generic"(%580, %570, %562) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.addf"(%arg1, %arg2) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512x384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %582 = "linalg.fill"(%15, %564) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %583 = "linalg.generic"(%581, %582) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.addf"(%arg2, %arg1) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %584 = "linalg.generic"(%583, %564) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "arith.divf"(%arg1, %122) : (f32, f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %585 = "linalg.fill"(%15, %564) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) : (f32, tensor<0x512xf32>) -> tensor<0x512xf32>
    %586 = "linalg.generic"(%581, %584, %585) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.mulf"(%600, %600) : (f32, f32) -> f32
      %602 = "arith.addf"(%arg3, %601) : (f32, f32) -> f32
      "linalg.yield"(%602) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>) -> tensor<0x512xf32>
    %587 = "linalg.generic"(%581, %584, %586, %119, %118, %562) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
      %600 = "arith.divf"(%arg3, %122) : (f32, f32) -> f32
      %601 = "arith.subf"(%arg1, %arg2) : (f32, f32) -> f32
      %602 = "arith.truncf"(%21) : (f64) -> f32
      %603 = "arith.addf"(%600, %602) : (f32, f32) -> f32
      %604 = "math.rsqrt"(%603) : (f32) -> f32
      %605 = "arith.mulf"(%601, %604) : (f32, f32) -> f32
      %606 = "arith.mulf"(%605, %arg4) : (f32, f32) -> f32
      %607 = "arith.addf"(%606, %arg5) : (f32, f32) -> f32
      "linalg.yield"(%607) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"], operand_segment_sizes = dense<[5, 1]> : vector<2xi32>} : (tensor<0x512x384xf32>, tensor<0x512xf32>, tensor<0x512xf32>, tensor<384xf32>, tensor<384xf32>, tensor<0x512x384xf32>) -> tensor<0x512x384xf32>
    %588 = "tensor.extract_slice"(%587) {operand_segment_sizes = dense<[1, 0, 0, 0]> : vector<4xi32>, static_offsets = [0, 0, 0], static_sizes = [0, 1, 384], static_strides = [1, 1, 1]} : (tensor<0x512x384xf32>) -> tensor<0x1x384xf32>
    %589 = "tensor.cast"(%588) : (tensor<0x1x384xf32>) -> tensor<?x?x384xf32>
    %590 = "flow.tensor.reshape"(%589, %2, %0, %2) {operand_segment_sizes = dense<[1, 2, 1]> : vector<3xi32>} : (tensor<?x?x384xf32>, index, index, index) -> tensor<?x384xf32>
    %591 = "linalg.init_tensor"() {static_sizes = [0, 384]} : () -> tensor<0x384xf32>
    %592 = "linalg.generic"(%120, %591) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<384xf32>, tensor<0x384xf32>) -> tensor<0x384xf32>
    %593 = "linalg.matmul"(%590, %7, %592) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x384xf32>, tensor<384x384xf32>, tensor<0x384xf32>) -> tensor<0x384xf32>
    %594 = "linalg.generic"(%593, %591) ({
    ^bb0(%arg1: f32, %arg2: f32):
      %600 = "math.tanh"(%arg1) : (f32) -> f32
      "linalg.yield"(%600) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<0x384xf32>, tensor<0x384xf32>) -> tensor<0x384xf32>
    %595 = "linalg.init_tensor"() {static_sizes = [0, 2]} : () -> tensor<0x2xf32>
    %596 = "linalg.generic"(%121, %595) ({
    ^bb0(%arg1: f32, %arg2: f32):
      "linalg.yield"(%arg1) : (f32) -> ()
    }) {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<2xf32>, tensor<0x2xf32>) -> tensor<0x2xf32>
    %597 = "linalg.matmul"(%594, %6, %596) ({
    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
      %600 = "arith.mulf"(%arg1, %arg2) : (f32, f32) -> f32
      %601 = "arith.addf"(%arg3, %600) : (f32, f32) -> f32
      "linalg.yield"(%601) : (f32) -> ()
    }) {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<0x384xf32>, tensor<384x2xf32>, tensor<0x2xf32>) -> tensor<0x2xf32>
    %598 = "tensor.cast"(%597) : (tensor<0x2xf32>) -> tensor<?x2xf32>
    %599 = "hal.tensor.export"(%598, %2) {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>, source_encoding = tensor<?x2xf32>} : (tensor<?x2xf32>, index) -> !hal.buffer_view
    "std.return"(%599) : (!hal.buffer_view) -> ()
  }) {iree.abi.stub, sym_name = "forward", type = (!hal.buffer_view) -> !hal.buffer_view} : () -> ()
}) {torch.debug_module_name = "MiniLMSequenceClassification"} : () -> ()