Skip to content

Instantly share code, notes, and snippets.

@stellaraccident
Created June 16, 2021 20:46
Show Gist options
  • Save stellaraccident/30ffddc028b0941ec6c2c3a7a8b71290 to your computer and use it in GitHub Desktop.
// -----// IR Dump Before mlir::iree_integrations::TF::LowerGlobalTensors //----- //
module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 779 : i32}, tf_saved_model.semantics} {
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node15__optimizer.iter", tf_saved_model.exported_names = [], type = tensor<i64>, value = dense<0> : tensor<i64>} : () -> ()
"tf_saved_model.global_tensor"() {sym_name = "__sm_node17__optimizer.learning_rate", tf_saved_model.exported_names = [], type = tensor<f32>, value = dense<0.00999999977> : tensor<f32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node45__m.layer-3.layer-1.embeddings", tf_saved_model.exported_names = [], type = tensor<100x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<100x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node46__m.layer-3.layer-3.embeddings", tf_saved_model.exported_names = [], type = tensor<512x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<512x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node47__m.layer-3.layer-4.embeddings", tf_saved_model.exported_names = [], type = tensor<16x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<16x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node48__m.layer-3.layer-6.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node49__m.layer-3.layer-6.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node50__m.layer-3.layer-10._attention_layer._query_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node51__m.layer-3.layer-10._attention_layer._query_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node52__m.layer-3.layer-10._attention_layer._key_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node53__m.layer-3.layer-10._attention_layer._key_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node54__m.layer-3.layer-10._attention_layer._value_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node55__m.layer-3.layer-10._attention_layer._value_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node56__m.layer-3.layer-10._attention_layer._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<12x64x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<12x64x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node57__m.layer-3.layer-10._attention_layer._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node58__m.layer-3.layer-10.keras_api.layers.2.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node59__m.layer-3.layer-10.keras_api.layers.2.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node60__m.layer-3.layer-10.keras_api.layers.3.kernel", tf_saved_model.exported_names = [], type = tensor<768x3072xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node61__m.layer-3.layer-10.keras_api.layers.3.bias", tf_saved_model.exported_names = [], type = tensor<3072xf32>, value = dense<0.000000e+00> : tensor<3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node62__m.layer-3.layer-10._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<3072x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<3072x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node63__m.layer-3.layer-10._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node64__m.layer-3.layer-10._output_layer_norm.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node65__m.layer-3.layer-10._output_layer_norm.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node66__m.layer-3.layer-11._attention_layer._query_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node67__m.layer-3.layer-11._attention_layer._query_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node68__m.layer-3.layer-11._attention_layer._key_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node69__m.layer-3.layer-11._attention_layer._key_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node70__m.layer-3.layer-11._attention_layer._value_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node71__m.layer-3.layer-11._attention_layer._value_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node72__m.layer-3.layer-11._attention_layer._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<12x64x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<12x64x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node73__m.layer-3.layer-11._attention_layer._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node74__m.layer-3.layer-11.keras_api.layers.2.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node75__m.layer-3.layer-11.keras_api.layers.2.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node76__m.layer-3.layer-11.keras_api.layers.3.kernel", tf_saved_model.exported_names = [], type = tensor<768x3072xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node77__m.layer-3.layer-11.keras_api.layers.3.bias", tf_saved_model.exported_names = [], type = tensor<3072xf32>, value = dense<0.000000e+00> : tensor<3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node78__m.layer-3.layer-11._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<3072x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<3072x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node79__m.layer-3.layer-11._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node80__m.layer-3.layer-11._output_layer_norm.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node81__m.layer-3.layer-11._output_layer_norm.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node82__m.layer-3.layer-13.kernel", tf_saved_model.exported_names = [], type = tensor<768x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node83__m.layer-3.layer-13.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node84__m.layer-5.out_proj.kernel", tf_saved_model.exported_names = [], type = tensor<768x5xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x5xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node85__m.layer-5.out_proj.bias", tf_saved_model.exported_names = [], type = tensor<5xf32>, value = dense<0.000000e+00> : tensor<5xf32>} : () -> ()
func @__inference_learn_29190(%arg0: tensor<1x512xi32> {tf._user_specified_name = "inputs", tf_saved_model.index_path = [0, 0, 0]}, %arg1: tensor<1x512xi32> {tf._user_specified_name = "inputs", tf_saved_model.index_path = [0, 0, 1]}, %arg2: tensor<1x512xi32> {tf._user_specified_name = "inputs", tf_saved_model.index_path = [0, 0, 2]}, %arg3: tensor<1xi32> {tf._user_specified_name = "labels", tf_saved_model.index_path = [1]}, %arg4: tensor<!tf.resource<tensor<100x768xf32>>> {tf_saved_model.bound_input = @"__sm_node45__m.layer-3.layer-1.embeddings"}, %arg5: tensor<!tf.resource<tensor<512x768xf32>>> {tf_saved_model.bound_input = @"__sm_node46__m.layer-3.layer-3.embeddings"}, %arg6: tensor<!tf.resource<tensor<16x768xf32>>> {tf_saved_model.bound_input = @"__sm_node47__m.layer-3.layer-4.embeddings"}, %arg7: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node48__m.layer-3.layer-6.gamma"}, %arg8: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node49__m.layer-3.layer-6.beta"}, %arg9: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node50__m.layer-3.layer-10._attention_layer._query_dense.kernel"}, %arg10: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node51__m.layer-3.layer-10._attention_layer._query_dense.bias"}, %arg11: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node52__m.layer-3.layer-10._attention_layer._key_dense.kernel"}, %arg12: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node53__m.layer-3.layer-10._attention_layer._key_dense.bias"}, %arg13: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node54__m.layer-3.layer-10._attention_layer._value_dense.kernel"}, %arg14: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node55__m.layer-3.layer-10._attention_layer._value_dense.bias"}, %arg15: 
tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node56__m.layer-3.layer-10._attention_layer._output_dense.kernel"}, %arg16: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node57__m.layer-3.layer-10._attention_layer._output_dense.bias"}, %arg17: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node58__m.layer-3.layer-10.keras_api.layers.2.gamma"}, %arg18: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node59__m.layer-3.layer-10.keras_api.layers.2.beta"}, %arg19: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node60__m.layer-3.layer-10.keras_api.layers.3.kernel"}, %arg20: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node61__m.layer-3.layer-10.keras_api.layers.3.bias"}, %arg21: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node62__m.layer-3.layer-10._output_dense.kernel"}, %arg22: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node63__m.layer-3.layer-10._output_dense.bias"}, %arg23: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node64__m.layer-3.layer-10._output_layer_norm.gamma"}, %arg24: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node65__m.layer-3.layer-10._output_layer_norm.beta"}, %arg25: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node66__m.layer-3.layer-11._attention_layer._query_dense.kernel"}, %arg26: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node67__m.layer-3.layer-11._attention_layer._query_dense.bias"}, %arg27: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node68__m.layer-3.layer-11._attention_layer._key_dense.kernel"}, %arg28: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = 
@"__sm_node69__m.layer-3.layer-11._attention_layer._key_dense.bias"}, %arg29: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node70__m.layer-3.layer-11._attention_layer._value_dense.kernel"}, %arg30: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node71__m.layer-3.layer-11._attention_layer._value_dense.bias"}, %arg31: tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node72__m.layer-3.layer-11._attention_layer._output_dense.kernel"}, %arg32: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node73__m.layer-3.layer-11._attention_layer._output_dense.bias"}, %arg33: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node74__m.layer-3.layer-11.keras_api.layers.2.gamma"}, %arg34: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node75__m.layer-3.layer-11.keras_api.layers.2.beta"}, %arg35: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node76__m.layer-3.layer-11.keras_api.layers.3.kernel"}, %arg36: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node77__m.layer-3.layer-11.keras_api.layers.3.bias"}, %arg37: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node78__m.layer-3.layer-11._output_dense.kernel"}, %arg38: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node79__m.layer-3.layer-11._output_dense.bias"}, %arg39: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node80__m.layer-3.layer-11._output_layer_norm.gamma"}, %arg40: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node81__m.layer-3.layer-11._output_layer_norm.beta"}, %arg41: tensor<!tf.resource<tensor<768x768xf32>>> {tf_saved_model.bound_input = @"__sm_node82__m.layer-3.layer-13.kernel"}, %arg42: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = 
@"__sm_node83__m.layer-3.layer-13.bias"}, %arg43: tensor<!tf.resource<tensor<768x5xf32>>> {tf_saved_model.bound_input = @"__sm_node84__m.layer-5.out_proj.kernel"}, %arg44: tensor<!tf.resource<tensor<5xf32>>> {tf_saved_model.bound_input = @"__sm_node85__m.layer-5.out_proj.bias"}, %arg45: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @__sm_node17__optimizer.learning_rate}, %arg46: tensor<!tf.resource<tensor<i64>>> {tf_saved_model.bound_input = @__sm_node15__optimizer.iter}) -> (tensor<f32> {tf_saved_model.index_path = []}) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<1>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful, tf_saved_model.exported_names = ["learn"]} {
%0 = mhlo.constant dense<1> : tensor<i64>
%1 = mhlo.constant dense<2.000000e+00> : tensor<f32>
%2 = mhlo.constant dense<1.000000e-07> : tensor<f32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<1x5xf32>
%4 = mhlo.constant dense<-1.000000e+09> : tensor<f32>
%5 = mhlo.constant dense<1.250000e-01> : tensor<f32>
%6 = mhlo.constant dense<0.797884583> : tensor<f32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<f32>
%8 = mhlo.constant dense<3.000000e+00> : tensor<f32>
%9 = mhlo.constant dense<4.471500e-02> : tensor<f32>
%10 = mhlo.constant dense<1.000000e+00> : tensor<1x512x1xf32>
%11 = mhlo.constant dense<9.99999996E-13> : tensor<f32>
%12 = mhlo.constant dense<1.000000e-01> : tensor<f32>
%13 = mhlo.constant dense<1.11111116> : tensor<f32>
%14 = mhlo.constant dense<0> : tensor<i64>
%15 = mhlo.constant dense<5> : tensor<i64>
%16 = mhlo.constant dense<0x7FC00000> : tensor<f32>
%17 = mhlo.constant dense<-2.000000e+00> : tensor<f32>
%18 = mhlo.constant dense<[1, 512, 768]> : tensor<3xi64>
%19 = mhlo.constant dense<[1, 12, 512, 512]> : tensor<4xi64>
%20 = mhlo.constant dense<[1, 768]> : tensor<2xi64>
%21 = mhlo.constant dense<7.680000e+02> : tensor<f32>
%22 = mhlo.constant dense<0xFF800000> : tensor<f32>
%23 = mhlo.constant dense<1.000000e+00> : tensor<f32>
%24 = mhlo.constant dense<0.000000e+00> : tensor<f32>
%25 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%26 = chlo.broadcast_compare %25, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%27 = "mhlo.convert"(%26) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%28 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%29 = chlo.broadcast_compare %28, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%30 = "mhlo.convert"(%29) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%31 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%32 = chlo.broadcast_compare %31, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%33 = "mhlo.convert"(%32) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%34 = "mhlo.rng_uniform"(%24, %23, %19) : (tensor<f32>, tensor<f32>, tensor<4xi64>) -> tensor<1x12x512x512xf32>
%35 = chlo.broadcast_compare %34, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xi1>
%36 = "mhlo.convert"(%35) : (tensor<1x12x512x512xi1>) -> tensor<1x12x512x512xf32>
%37 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%38 = chlo.broadcast_compare %37, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%39 = "mhlo.convert"(%38) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%40 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%41 = chlo.broadcast_compare %40, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%42 = "mhlo.convert"(%41) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%43 = "mhlo.rng_uniform"(%24, %23, %19) : (tensor<f32>, tensor<f32>, tensor<4xi64>) -> tensor<1x12x512x512xf32>
%44 = chlo.broadcast_compare %43, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xi1>
%45 = "mhlo.convert"(%44) : (tensor<1x12x512x512xi1>) -> tensor<1x12x512x512xf32>
%46 = "mhlo.rng_uniform"(%24, %23, %20) : (tensor<f32>, tensor<f32>, tensor<2xi64>) -> tensor<1x768xf32>
%47 = chlo.broadcast_compare %46, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x768xf32>, tensor<f32>) -> tensor<1x768xi1>
%48 = "mhlo.convert"(%47) : (tensor<1x768xi1>) -> tensor<1x768xf32>
%49 = "tf.ReadVariableOp"(%arg7) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%50 = "tf.ReadVariableOp"(%arg8) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%51 = "tf.ReadVariableOp"(%arg42) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%52 = "tf.ReadVariableOp"(%arg41) {device = ""} : (tensor<!tf.resource<tensor<768x768xf32>>>) -> tensor<768x768xf32>
%53 = "tf.ReadVariableOp"(%arg5) {device = ""} : (tensor<!tf.resource<tensor<512x768xf32>>>) -> tensor<512x768xf32>
%54 = "mhlo.reshape"(%53) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%55 = "tf.ReadVariableOp"(%arg20) {device = ""} : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%56 = "tf.ReadVariableOp"(%arg19) {device = ""} : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%57 = "tf.ReadVariableOp"(%arg22) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%58 = "tf.ReadVariableOp"(%arg21) {device = ""} : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%59 = "tf.ReadVariableOp"(%arg23) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%60 = "tf.ReadVariableOp"(%arg24) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%61 = "tf.ReadVariableOp"(%arg16) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%62 = "tf.ReadVariableOp"(%arg15) {device = ""} : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%63 = "tf.ReadVariableOp"(%arg12) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%64 = "tf.ReadVariableOp"(%arg11) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%65 = "tf.ReadVariableOp"(%arg17) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%66 = "tf.ReadVariableOp"(%arg18) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%67 = "tf.ReadVariableOp"(%arg10) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%68 = "tf.ReadVariableOp"(%arg9) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%69 = "tf.ReadVariableOp"(%arg14) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%70 = "tf.ReadVariableOp"(%arg13) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%71 = "tf.ReadVariableOp"(%arg36) {device = ""} : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%72 = "tf.ReadVariableOp"(%arg35) {device = ""} : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%73 = "tf.ReadVariableOp"(%arg38) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%74 = "tf.ReadVariableOp"(%arg37) {device = ""} : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%75 = "tf.ReadVariableOp"(%arg39) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%76 = "tf.ReadVariableOp"(%arg40) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%77 = "tf.ReadVariableOp"(%arg32) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%78 = "tf.ReadVariableOp"(%arg31) {device = ""} : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%79 = "tf.ReadVariableOp"(%arg28) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%80 = "tf.ReadVariableOp"(%arg27) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%81 = "tf.ReadVariableOp"(%arg33) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%82 = "tf.ReadVariableOp"(%arg34) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%83 = "tf.ReadVariableOp"(%arg26) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%84 = "tf.ReadVariableOp"(%arg25) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%85 = "tf.ReadVariableOp"(%arg30) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%86 = "tf.ReadVariableOp"(%arg29) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%87 = "tf.ReadVariableOp"(%arg6) {device = ""} : (tensor<!tf.resource<tensor<16x768xf32>>>) -> tensor<16x768xf32>
%88 = "tf.ReadVariableOp"(%arg44) {device = ""} : (tensor<!tf.resource<tensor<5xf32>>>) -> tensor<5xf32>
%89 = "tf.ReadVariableOp"(%arg43) {device = ""} : (tensor<!tf.resource<tensor<768x5xf32>>>) -> tensor<768x5xf32>
%90 = chlo.broadcast_subtract %8, %23 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%91 = "mhlo.reshape"(%arg0) : (tensor<1x512xi32>) -> tensor<512xi32>
%92 = "tf.ReadVariableOp"(%arg4) : (tensor<!tf.resource<tensor<100x768xf32>>>) -> tensor<100x768xf32>
%93 = "mhlo.torch_index_select"(%92, %91) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<100x768xf32>, tensor<512xi32>) -> tensor<512x768xf32>
%94 = "mhlo.reshape"(%93) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%95 = chlo.broadcast_add %94, %54 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%96 = "mhlo.reshape"(%arg1) : (tensor<1x512xi32>) -> tensor<1x1x512xi32>
%97 = "mhlo.convert"(%96) : (tensor<1x1x512xi32>) -> tensor<1x1x512xf32>
%98 = chlo.broadcast_multiply %97, %10 : (tensor<1x1x512xf32>, tensor<1x512x1xf32>) -> tensor<1x512x512xf32>
%99 = "mhlo.reshape"(%98) : (tensor<1x512x512xf32>) -> tensor<1x1x512x512xf32>
%100 = chlo.broadcast_subtract %23, %99 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x1x512x512xf32>) -> tensor<1x1x512x512xf32>
%101 = chlo.broadcast_multiply %100, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x512x512xf32>, tensor<f32>) -> tensor<1x1x512x512xf32>
%102 = "mhlo.reshape"(%arg2) : (tensor<1x512xi32>) -> tensor<512xi32>
%103 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<16xi32>
%104 = "mhlo.broadcast_in_dim"(%103) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<16xi32>) -> tensor<512x16xi32>
%105 = "mhlo.broadcast_in_dim"(%102) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<512xi32>) -> tensor<512x16xi32>
%106 = "mhlo.compare"(%105, %104) {comparison_direction = "EQ"} : (tensor<512x16xi32>, tensor<512x16xi32>) -> tensor<512x16xi1>
%107 = "mhlo.broadcast"(%23) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%108 = "mhlo.broadcast"(%24) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%109 = "mhlo.select"(%106, %107, %108) : (tensor<512x16xi1>, tensor<512x16xf32>, tensor<512x16xf32>) -> tensor<512x16xf32>
%110 = "mhlo.dot"(%109, %87) : (tensor<512x16xf32>, tensor<16x768xf32>) -> tensor<512x768xf32>
%111 = "mhlo.reshape"(%110) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%112 = chlo.broadcast_add %95, %111 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%113 = "mhlo.reduce"(%112, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%114 = chlo.broadcast_divide %113, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%115 = "mhlo.reshape"(%114) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%116 = chlo.broadcast_subtract %112, %115 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%117 = chlo.broadcast_multiply %116, %116 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%118 = "mhlo.reduce"(%117, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%119 = chlo.broadcast_divide %118, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%120 = "mhlo.reshape"(%119) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%121 = chlo.broadcast_add %120, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%122 = "mhlo.rsqrt"(%121) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%123 = chlo.broadcast_multiply %122, %49 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%124 = chlo.broadcast_multiply %112, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%125 = chlo.broadcast_multiply %115, %123 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%126 = chlo.broadcast_subtract %50, %125 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%127 = chlo.broadcast_add %124, %126 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%128 = chlo.broadcast_multiply %127, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%129 = chlo.broadcast_multiply %128, %27 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%130 = "mhlo.einsum"(%129, %64) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%131 = chlo.broadcast_add %130, %63 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%132 = "mhlo.einsum"(%129, %68) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%133 = chlo.broadcast_add %132, %67 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%134 = chlo.broadcast_multiply %133, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%135 = "mhlo.einsum"(%131, %134) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%136 = chlo.broadcast_add %135, %101 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
%137 = "mhlo.reduce"(%136, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%138 = "mhlo.broadcast_in_dim"(%137) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%139 = mhlo.subtract %136, %138 : tensor<1x12x512x512xf32>
%140 = "mhlo.exponential"(%139) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%141 = "mhlo.reduce"(%140, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%142 = "mhlo.broadcast_in_dim"(%141) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%143 = mhlo.divide %140, %142 : tensor<1x12x512x512xf32>
%144 = chlo.broadcast_multiply %143, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%145 = chlo.broadcast_multiply %144, %36 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%146 = "mhlo.einsum"(%129, %70) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%147 = chlo.broadcast_add %146, %69 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%148 = "mhlo.einsum"(%145, %147) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%149 = "mhlo.einsum"(%148, %62) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%150 = chlo.broadcast_add %149, %61 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%151 = chlo.broadcast_multiply %150, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%152 = chlo.broadcast_multiply %151, %30 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%153 = chlo.broadcast_add %129, %152 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%154 = "mhlo.reduce"(%153, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%155 = chlo.broadcast_divide %154, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%156 = "mhlo.reshape"(%155) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%157 = chlo.broadcast_subtract %153, %156 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%158 = chlo.broadcast_multiply %157, %157 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%159 = "mhlo.reduce"(%158, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%160 = chlo.broadcast_divide %159, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%161 = "mhlo.reshape"(%160) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%162 = chlo.broadcast_add %161, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%163 = "mhlo.rsqrt"(%162) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%164 = chlo.broadcast_multiply %163, %65 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%165 = chlo.broadcast_multiply %153, %164 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%166 = chlo.broadcast_multiply %156, %164 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%167 = chlo.broadcast_subtract %66, %166 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%168 = chlo.broadcast_add %165, %167 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%169 = "mhlo.einsum"(%168, %56) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%170 = chlo.broadcast_add %169, %55 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%171 = chlo.broadcast_power %170, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%172 = chlo.broadcast_multiply %171, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%173 = chlo.broadcast_add %170, %172 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%174 = chlo.broadcast_multiply %173, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%175 = "mhlo.tanh"(%174) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%176 = chlo.broadcast_add %175, %23 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%177 = chlo.broadcast_multiply %170, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%178 = chlo.broadcast_multiply %177, %176 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%179 = "mhlo.einsum"(%178, %58) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%180 = chlo.broadcast_add %179, %57 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%181 = chlo.broadcast_multiply %180, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%182 = chlo.broadcast_multiply %181, %33 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%183 = chlo.broadcast_add %182, %168 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%184 = "mhlo.reduce"(%183, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%185 = chlo.broadcast_divide %184, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%186 = "mhlo.reshape"(%185) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%187 = chlo.broadcast_subtract %183, %186 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%188 = chlo.broadcast_multiply %187, %187 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%189 = "mhlo.reduce"(%188, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%190 = chlo.broadcast_divide %189, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%191 = "mhlo.reshape"(%190) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%192 = chlo.broadcast_add %191, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%193 = "mhlo.rsqrt"(%192) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%194 = chlo.broadcast_multiply %193, %59 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%195 = chlo.broadcast_multiply %183, %194 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%196 = chlo.broadcast_multiply %186, %194 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%197 = chlo.broadcast_subtract %60, %196 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%198 = chlo.broadcast_add %195, %197 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%199 = "mhlo.einsum"(%198, %80) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%200 = chlo.broadcast_add %199, %79 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%201 = "mhlo.einsum"(%198, %84) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%202 = chlo.broadcast_add %201, %83 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%203 = chlo.broadcast_multiply %202, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%204 = "mhlo.einsum"(%200, %203) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%205 = chlo.broadcast_add %204, %101 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
%206 = "mhlo.reduce"(%205, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%207 = "mhlo.broadcast_in_dim"(%206) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%208 = mhlo.subtract %205, %207 : tensor<1x12x512x512xf32>
%209 = "mhlo.exponential"(%208) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%210 = "mhlo.reduce"(%209, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%211 = "mhlo.broadcast_in_dim"(%210) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%212 = mhlo.divide %209, %211 : tensor<1x12x512x512xf32>
%213 = chlo.broadcast_multiply %212, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%214 = chlo.broadcast_multiply %213, %45 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%215 = "mhlo.einsum"(%198, %86) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%216 = chlo.broadcast_add %215, %85 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%217 = "mhlo.einsum"(%214, %216) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%218 = "mhlo.einsum"(%217, %78) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%219 = chlo.broadcast_add %218, %77 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%220 = chlo.broadcast_multiply %219, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%221 = chlo.broadcast_multiply %220, %39 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%222 = chlo.broadcast_add %198, %221 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%223 = "mhlo.reduce"(%222, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%224 = chlo.broadcast_divide %223, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%225 = "mhlo.reshape"(%224) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%226 = chlo.broadcast_subtract %222, %225 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%227 = chlo.broadcast_multiply %226, %226 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%228 = "mhlo.reduce"(%227, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%229 = chlo.broadcast_divide %228, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%230 = "mhlo.reshape"(%229) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%231 = chlo.broadcast_add %230, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%232 = "mhlo.rsqrt"(%231) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%233 = chlo.broadcast_multiply %232, %81 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%234 = chlo.broadcast_multiply %222, %233 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%235 = chlo.broadcast_multiply %225, %233 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%236 = chlo.broadcast_subtract %82, %235 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%237 = chlo.broadcast_add %234, %236 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%238 = "mhlo.einsum"(%237, %72) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%239 = chlo.broadcast_add %238, %71 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%240 = chlo.broadcast_power %239, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%241 = chlo.broadcast_multiply %240, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%242 = chlo.broadcast_add %239, %241 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%243 = chlo.broadcast_multiply %242, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%244 = "mhlo.tanh"(%243) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%245 = chlo.broadcast_add %244, %23 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%246 = chlo.broadcast_multiply %239, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%247 = chlo.broadcast_multiply %246, %245 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%248 = "mhlo.einsum"(%247, %74) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%249 = chlo.broadcast_add %248, %73 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%250 = chlo.broadcast_multiply %249, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%251 = chlo.broadcast_multiply %250, %42 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%252 = chlo.broadcast_add %251, %237 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%253 = "mhlo.reduce"(%252, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%254 = chlo.broadcast_divide %253, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%255 = "mhlo.reshape"(%254) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%256 = chlo.broadcast_subtract %252, %255 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%257 = chlo.broadcast_multiply %256, %256 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%258 = "mhlo.reduce"(%257, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%259 = chlo.broadcast_divide %258, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%260 = "mhlo.reshape"(%259) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%261 = chlo.broadcast_add %260, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%262 = "mhlo.rsqrt"(%261) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%263 = chlo.broadcast_multiply %262, %75 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%264 = chlo.broadcast_multiply %252, %263 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%265 = chlo.broadcast_multiply %255, %263 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%266 = chlo.broadcast_subtract %76, %265 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%267 = chlo.broadcast_add %264, %266 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%268 = "mhlo.slice"(%267) {limit_indices = dense<[1, 1, 768]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x512x768xf32>) -> tensor<1x1x768xf32>
%269 = "mhlo.reshape"(%268) : (tensor<1x1x768xf32>) -> tensor<1x768xf32>
%270 = "mhlo.dot"(%269, %52) : (tensor<1x768xf32>, tensor<768x768xf32>) -> tensor<1x768xf32>
%271 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<768xf32>) -> tensor<1x768xf32>
%272 = mhlo.add %270, %271 : tensor<1x768xf32>
%273 = "mhlo.tanh"(%272) : (tensor<1x768xf32>) -> tensor<1x768xf32>
%274 = chlo.broadcast_multiply %273, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x768xf32>, tensor<f32>) -> tensor<1x768xf32>
%275 = chlo.broadcast_multiply %274, %48 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%276 = "mhlo.dot"(%275, %89) : (tensor<1x768xf32>, tensor<768x5xf32>) -> tensor<1x5xf32>
%277 = "mhlo.broadcast_in_dim"(%88) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<1x5xf32>
%278 = mhlo.add %276, %277 : tensor<1x5xf32>
%279 = chlo.broadcast_power %239, %90 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%280 = chlo.broadcast_power %170, %90 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%281 = "mhlo.convert"(%arg3) : (tensor<1xi32>) -> tensor<1xf32>
%282 = "mhlo.convert"(%281) : (tensor<1xf32>) -> tensor<1xi64>
%283 = "tf.ReadVariableOp"(%arg45) : (tensor<!tf.resource<tensor<f32>>>) -> tensor<f32>
%284 = chlo.broadcast_subtract %23, %2 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%285 = chlo.broadcast_compare %278, %284 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "LE"} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xi1>
%286 = chlo.broadcast_minimum %278, %284 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xf32>
%287 = chlo.broadcast_compare %286, %2 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xi1>
%288 = chlo.broadcast_maximum %286, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xf32>
%289 = "mhlo.log"(%288) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%290 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<5xi64>
%291 = "mhlo.broadcast_in_dim"(%290) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xi64>) -> tensor<1x5xi64>
%292 = "mhlo.broadcast_in_dim"(%282) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xi64>) -> tensor<1x5xi64>
%293 = "mhlo.compare"(%292, %291) {comparison_direction = "EQ"} : (tensor<1x5xi64>, tensor<1x5xi64>) -> tensor<1x5xi1>
%294 = "mhlo.broadcast"(%23) {broadcast_sizes = dense<[1, 5]> : tensor<2xi64>} : (tensor<f32>) -> tensor<1x5xf32>
%295 = "mhlo.broadcast"(%24) {broadcast_sizes = dense<[1, 5]> : tensor<2xi64>} : (tensor<f32>) -> tensor<1x5xf32>
%296 = "mhlo.select"(%293, %294, %295) : (tensor<1x5xi1>, tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%297 = chlo.broadcast_compare %14, %282 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "LE"} : (tensor<i64>, tensor<1xi64>) -> tensor<1xi1>
%298 = chlo.broadcast_compare %282, %15 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "LT"} : (tensor<1xi64>, tensor<i64>) -> tensor<1xi1>
%299 = chlo.broadcast_and %297, %298 : (tensor<1xi1>, tensor<1xi1>) -> tensor<1xi1>
%300 = chlo.broadcast_select %299, %24, %16 : (tensor<1xi1>, tensor<f32>, tensor<f32>) -> tensor<1xf32>
%301 = "mhlo.reshape"(%300) : (tensor<1xf32>) -> tensor<1x1xf32>
%302 = chlo.broadcast_add %296, %301 : (tensor<1x5xf32>, tensor<1x1xf32>) -> tensor<1x5xf32>
%303 = "mhlo.reduce"(%289, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%304 = "mhlo.broadcast_in_dim"(%303) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%305 = mhlo.subtract %289, %304 : tensor<1x5xf32>
%306 = "mhlo.exponential"(%305) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%307 = "mhlo.reduce"(%306, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%308 = "mhlo.log"(%307) : (tensor<1xf32>) -> tensor<1xf32>
%309 = "mhlo.broadcast_in_dim"(%308) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%310 = mhlo.subtract %305, %309 : tensor<1x5xf32>
%311 = "mhlo.negate"(%302) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%312 = chlo.broadcast_compare %311, %24 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "EQ"} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xi1>
%313 = chlo.broadcast_multiply %310, %311 : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%314 = chlo.broadcast_select %312, %24, %313 : (tensor<1x5xi1>, tensor<f32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%315 = "mhlo.reduce"(%314, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%316 = "mhlo.reduce"(%289, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%317 = "mhlo.broadcast_in_dim"(%316) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%318 = mhlo.subtract %289, %317 : tensor<1x5xf32>
%319 = "mhlo.exponential"(%318) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%320 = "mhlo.reduce"(%319, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%321 = "mhlo.broadcast_in_dim"(%320) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%322 = mhlo.divide %319, %321 : tensor<1x5xf32>
%323 = chlo.broadcast_subtract %322, %302 : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%324 = "mhlo.reduce"(%315, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>, tensor<f32>) -> tensor<f32>
%325 = chlo.broadcast_compare %23, %24 {comparison_direction = "EQ"} : (tensor<f32>, tensor<f32>) -> tensor<i1>
%326 = chlo.broadcast_divide %23, %23 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%327 = chlo.broadcast_select %325, %24, %326 : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
%328 = "mhlo.reshape"(%327) : (tensor<f32>) -> tensor<1x1xf32>
%329 = chlo.broadcast_multiply %328, %323 : (tensor<1x1xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%330 = chlo.broadcast_divide %23, %288 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%331 = chlo.broadcast_multiply %329, %330 : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%332 = chlo.broadcast_select %287, %331, %3 : (tensor<1x5xi1>, tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%333 = chlo.broadcast_select %285, %332, %3 : (tensor<1x5xi1>, tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%334 = "mhlo.reduce"(%333, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<0> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<5xf32>
%335 = chlo.broadcast_multiply %283, %334 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<5xf32>) -> tensor<5xf32>
%336 = "tf.ReadVariableOp"(%arg44) : (tensor<!tf.resource<tensor<5xf32>>>) -> tensor<5xf32>
%337 = chlo.broadcast_subtract %336, %335 : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32>
"tf.AssignVariableOp"(%arg44, %337) : (tensor<!tf.resource<tensor<5xf32>>>, tensor<5xf32>) -> ()
%338 = "mhlo.transpose"(%89) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<768x5xf32>) -> tensor<5x768xf32>
%339 = "mhlo.dot"(%333, %338) : (tensor<1x5xf32>, tensor<5x768xf32>) -> tensor<1x768xf32>
%340 = chlo.broadcast_multiply %339, %48 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%341 = chlo.broadcast_multiply %340, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x768xf32>, tensor<f32>) -> tensor<1x768xf32>
%342 = chlo.broadcast_multiply %273, %273 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%343 = chlo.broadcast_subtract %23, %342 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%344 = chlo.broadcast_multiply %341, %343 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%345 = "mhlo.reduce"(%344, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<0> : tensor<1xi64>} : (tensor<1x768xf32>, tensor<f32>) -> tensor<768xf32>
%346 = chlo.broadcast_multiply %283, %345 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%347 = "tf.ReadVariableOp"(%arg42) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%348 = chlo.broadcast_subtract %347, %346 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg42, %348) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%349 = "mhlo.transpose"(%52) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<768x768xf32>) -> tensor<768x768xf32>
%350 = "mhlo.dot"(%344, %349) : (tensor<1x768xf32>, tensor<768x768xf32>) -> tensor<1x768xf32>
%351 = "mhlo.reshape"(%350) : (tensor<1x768xf32>) -> tensor<1x1x768xf32>
%352 = "mhlo.pad"(%351, %24) {edge_padding_high = dense<[0, 511, 0]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x1x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%353 = chlo.broadcast_multiply %352, %263 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%354 = chlo.broadcast_multiply %352, %252 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%355 = "mhlo.negate"(%352) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%356 = chlo.broadcast_multiply %355, %263 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%357 = "mhlo.reduce"(%356, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%358 = "mhlo.reshape"(%357) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%359 = "mhlo.broadcast_in_dim"(%358) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%360 = chlo.broadcast_divide %23, %21 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%361 = chlo.broadcast_multiply %359, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%362 = chlo.broadcast_multiply %255, %355 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%363 = chlo.broadcast_add %354, %362 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%364 = chlo.broadcast_multiply %363, %75 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%365 = "mhlo.reduce"(%364, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%366 = "mhlo.reshape"(%365) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%367 = chlo.broadcast_multiply %262, %262 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%368 = chlo.broadcast_multiply %367, %262 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%369 = chlo.broadcast_divide %366, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%370 = chlo.broadcast_multiply %368, %369 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%371 = "mhlo.broadcast_in_dim"(%370) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%372 = chlo.broadcast_multiply %371, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%373 = chlo.broadcast_multiply %372, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%374 = chlo.broadcast_subtract %252, %255 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%375 = chlo.broadcast_multiply %373, %374 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%376 = chlo.broadcast_add %353, %375 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%377 = chlo.broadcast_add %376, %361 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%378 = chlo.broadcast_multiply %377, %42 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%379 = chlo.broadcast_multiply %378, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%380 = "mhlo.reduce"(%379, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%381 = chlo.broadcast_multiply %283, %380 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%382 = "tf.ReadVariableOp"(%arg38) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%383 = chlo.broadcast_subtract %382, %381 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg38, %383) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%384 = "mhlo.einsum"(%379, %74) {einsum_config = "abd,cd->abc"} : (tensor<1x512x768xf32>, tensor<3072x768xf32>) -> tensor<1x512x3072xf32>
%385 = chlo.broadcast_multiply %384, %245 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%386 = chlo.broadcast_multiply %385, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%387 = chlo.broadcast_multiply %384, %246 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%388 = chlo.broadcast_multiply %244, %244 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%389 = chlo.broadcast_subtract %23, %388 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%390 = chlo.broadcast_multiply %387, %389 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%391 = chlo.broadcast_multiply %390, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%392 = chlo.broadcast_multiply %391, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%393 = chlo.broadcast_multiply %392, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%394 = chlo.broadcast_multiply %393, %279 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%395 = chlo.broadcast_add %386, %391 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%396 = chlo.broadcast_add %395, %394 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%397 = "mhlo.reduce"(%396, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<3072xf32>
%398 = chlo.broadcast_multiply %283, %397 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072xf32>) -> tensor<3072xf32>
%399 = "tf.ReadVariableOp"(%arg36) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%400 = chlo.broadcast_subtract %399, %398 : (tensor<3072xf32>, tensor<3072xf32>) -> tensor<3072xf32>
"tf.AssignVariableOp"(%arg36, %400) : (tensor<!tf.resource<tensor<3072xf32>>>, tensor<3072xf32>) -> ()
%401 = "mhlo.einsum"(%396, %72) {einsum_config = "abd,cd->abc"} : (tensor<1x512x3072xf32>, tensor<768x3072xf32>) -> tensor<1x512x768xf32>
%402 = chlo.broadcast_add %377, %401 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%403 = chlo.broadcast_multiply %402, %233 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%404 = chlo.broadcast_multiply %402, %222 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%405 = "mhlo.negate"(%402) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%406 = chlo.broadcast_multiply %405, %233 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%407 = "mhlo.reduce"(%406, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%408 = "mhlo.reshape"(%407) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%409 = "mhlo.broadcast_in_dim"(%408) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%410 = chlo.broadcast_multiply %409, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%411 = chlo.broadcast_multiply %225, %405 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%412 = chlo.broadcast_add %404, %411 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%413 = chlo.broadcast_multiply %412, %81 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%414 = "mhlo.reduce"(%413, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%415 = "mhlo.reshape"(%414) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%416 = chlo.broadcast_multiply %232, %232 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%417 = chlo.broadcast_multiply %416, %232 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%418 = chlo.broadcast_divide %415, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%419 = chlo.broadcast_multiply %417, %418 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%420 = "mhlo.broadcast_in_dim"(%419) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%421 = chlo.broadcast_multiply %420, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%422 = chlo.broadcast_multiply %421, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%423 = chlo.broadcast_subtract %222, %225 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%424 = chlo.broadcast_multiply %422, %423 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%425 = chlo.broadcast_add %403, %424 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%426 = chlo.broadcast_add %425, %410 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%427 = chlo.broadcast_multiply %426, %39 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%428 = chlo.broadcast_multiply %427, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%429 = "mhlo.reduce"(%428, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%430 = chlo.broadcast_multiply %283, %429 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%431 = "tf.ReadVariableOp"(%arg32) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%432 = chlo.broadcast_subtract %431, %430 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg32, %432) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%433 = "mhlo.einsum"(%428, %78) {einsum_config = "abe,cde->abcd"} : (tensor<1x512x768xf32>, tensor<12x64x768xf32>) -> tensor<1x512x12x64xf32>
%434 = "mhlo.einsum"(%433, %216) {einsum_config = "abcd,aecd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%435 = chlo.broadcast_multiply %434, %45 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%436 = chlo.broadcast_multiply %435, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%437 = chlo.broadcast_multiply %436, %212 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%438 = "mhlo.reduce"(%437, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%439 = "mhlo.reshape"(%438) : (tensor<1x12x512xf32>) -> tensor<1x12x512x1xf32>
%440 = chlo.broadcast_subtract %436, %439 : (tensor<1x12x512x512xf32>, tensor<1x12x512x1xf32>) -> tensor<1x12x512x512xf32>
%441 = chlo.broadcast_multiply %440, %212 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%442 = "mhlo.einsum"(%441, %203) {einsum_config = "acbe,abcd->aecd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%443 = "mhlo.reduce"(%442, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%444 = chlo.broadcast_multiply %283, %443 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%445 = "tf.ReadVariableOp"(%arg28) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%446 = chlo.broadcast_subtract %445, %444 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg28, %446) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%447 = "mhlo.einsum"(%442, %80) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%448 = "mhlo.einsum"(%442, %198) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%449 = chlo.broadcast_multiply %283, %448 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%450 = "tf.ReadVariableOp"(%arg27) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%451 = chlo.broadcast_subtract %450, %449 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg27, %451) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%452 = "mhlo.einsum"(%441, %200) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%453 = chlo.broadcast_multiply %452, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%454 = "mhlo.reduce"(%453, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%455 = chlo.broadcast_multiply %283, %454 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%456 = "tf.ReadVariableOp"(%arg26) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%457 = chlo.broadcast_subtract %456, %455 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg26, %457) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%458 = "mhlo.einsum"(%453, %84) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%459 = "mhlo.einsum"(%453, %198) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%460 = chlo.broadcast_multiply %283, %459 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%461 = "tf.ReadVariableOp"(%arg25) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%462 = chlo.broadcast_subtract %461, %460 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg25, %462) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%463 = "mhlo.einsum"(%433, %214) {einsum_config = "abcd,acbe->aecd"} : (tensor<1x512x12x64xf32>, tensor<1x12x512x512xf32>) -> tensor<1x512x12x64xf32>
%464 = "mhlo.reduce"(%463, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%465 = chlo.broadcast_multiply %283, %464 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%466 = "tf.ReadVariableOp"(%arg30) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%467 = chlo.broadcast_subtract %466, %465 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg30, %467) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%468 = "mhlo.einsum"(%463, %86) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%469 = chlo.broadcast_add %426, %468 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%470 = chlo.broadcast_add %447, %458 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%471 = chlo.broadcast_add %469, %470 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%472 = chlo.broadcast_multiply %471, %194 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%473 = chlo.broadcast_multiply %471, %183 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%474 = "mhlo.negate"(%471) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%475 = chlo.broadcast_multiply %474, %194 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%476 = "mhlo.reduce"(%475, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%477 = "mhlo.reshape"(%476) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%478 = "mhlo.broadcast_in_dim"(%477) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%479 = chlo.broadcast_multiply %478, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%480 = chlo.broadcast_multiply %186, %474 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%481 = chlo.broadcast_add %473, %480 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%482 = chlo.broadcast_multiply %481, %59 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%483 = "mhlo.reduce"(%482, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%484 = "mhlo.reshape"(%483) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%485 = chlo.broadcast_multiply %193, %193 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%486 = chlo.broadcast_multiply %485, %193 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%487 = chlo.broadcast_divide %484, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%488 = chlo.broadcast_multiply %486, %487 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%489 = "mhlo.broadcast_in_dim"(%488) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%490 = chlo.broadcast_multiply %489, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%491 = chlo.broadcast_multiply %490, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%492 = chlo.broadcast_subtract %183, %186 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%493 = chlo.broadcast_multiply %491, %492 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%494 = chlo.broadcast_add %472, %493 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%495 = chlo.broadcast_add %494, %479 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%496 = chlo.broadcast_multiply %495, %33 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%497 = chlo.broadcast_multiply %496, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%498 = "mhlo.reduce"(%497, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%499 = chlo.broadcast_multiply %283, %498 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%500 = "tf.ReadVariableOp"(%arg22) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%501 = chlo.broadcast_subtract %500, %499 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg22, %501) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%502 = "mhlo.einsum"(%497, %58) {einsum_config = "abd,cd->abc"} : (tensor<1x512x768xf32>, tensor<3072x768xf32>) -> tensor<1x512x3072xf32>
%503 = chlo.broadcast_multiply %502, %176 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%504 = chlo.broadcast_multiply %503, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%505 = chlo.broadcast_multiply %502, %177 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%506 = chlo.broadcast_multiply %175, %175 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%507 = chlo.broadcast_subtract %23, %506 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%508 = chlo.broadcast_multiply %505, %507 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%509 = chlo.broadcast_multiply %508, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%510 = chlo.broadcast_multiply %509, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%511 = chlo.broadcast_multiply %510, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%512 = chlo.broadcast_multiply %511, %280 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%513 = chlo.broadcast_add %504, %509 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%514 = chlo.broadcast_add %513, %512 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%515 = "mhlo.reduce"(%514, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<3072xf32>
%516 = chlo.broadcast_multiply %283, %515 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072xf32>) -> tensor<3072xf32>
%517 = "tf.ReadVariableOp"(%arg20) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%518 = chlo.broadcast_subtract %517, %516 : (tensor<3072xf32>, tensor<3072xf32>) -> tensor<3072xf32>
"tf.AssignVariableOp"(%arg20, %518) : (tensor<!tf.resource<tensor<3072xf32>>>, tensor<3072xf32>) -> ()
%519 = "mhlo.einsum"(%514, %56) {einsum_config = "abd,cd->abc"} : (tensor<1x512x3072xf32>, tensor<768x3072xf32>) -> tensor<1x512x768xf32>
%520 = chlo.broadcast_add %495, %519 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%521 = chlo.broadcast_multiply %520, %164 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%522 = chlo.broadcast_multiply %520, %153 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%523 = "mhlo.negate"(%520) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%524 = chlo.broadcast_multiply %523, %164 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%525 = "mhlo.reduce"(%524, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%526 = "mhlo.reshape"(%525) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%527 = "mhlo.broadcast_in_dim"(%526) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%528 = chlo.broadcast_multiply %527, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%529 = chlo.broadcast_multiply %156, %523 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%530 = chlo.broadcast_add %522, %529 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%531 = chlo.broadcast_multiply %530, %65 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%532 = "mhlo.reduce"(%531, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%533 = "mhlo.reshape"(%532) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%534 = chlo.broadcast_multiply %163, %163 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%535 = chlo.broadcast_multiply %534, %163 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%536 = chlo.broadcast_divide %533, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%537 = chlo.broadcast_multiply %535, %536 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%538 = "mhlo.broadcast_in_dim"(%537) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%539 = chlo.broadcast_multiply %538, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%540 = chlo.broadcast_multiply %539, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%541 = chlo.broadcast_subtract %153, %156 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%542 = chlo.broadcast_multiply %540, %541 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%543 = chlo.broadcast_add %521, %542 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%544 = chlo.broadcast_add %543, %528 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%545 = chlo.broadcast_multiply %544, %30 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%546 = chlo.broadcast_multiply %545, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%547 = "mhlo.reduce"(%546, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%548 = chlo.broadcast_multiply %283, %547 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%549 = "tf.ReadVariableOp"(%arg16) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%550 = chlo.broadcast_subtract %549, %548 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg16, %550) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%551 = "mhlo.einsum"(%546, %62) {einsum_config = "abe,cde->abcd"} : (tensor<1x512x768xf32>, tensor<12x64x768xf32>) -> tensor<1x512x12x64xf32>
%552 = "mhlo.einsum"(%551, %147) {einsum_config = "abcd,aecd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%553 = chlo.broadcast_multiply %552, %36 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%554 = chlo.broadcast_multiply %553, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%555 = chlo.broadcast_multiply %554, %143 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%556 = "mhlo.reduce"(%555, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%557 = "mhlo.reshape"(%556) : (tensor<1x12x512xf32>) -> tensor<1x12x512x1xf32>
%558 = chlo.broadcast_subtract %554, %557 : (tensor<1x12x512x512xf32>, tensor<1x12x512x1xf32>) -> tensor<1x12x512x512xf32>
%559 = chlo.broadcast_multiply %558, %143 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%560 = "mhlo.einsum"(%559, %134) {einsum_config = "acbe,abcd->aecd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%561 = "mhlo.reduce"(%560, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%562 = chlo.broadcast_multiply %283, %561 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%563 = "tf.ReadVariableOp"(%arg12) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%564 = chlo.broadcast_subtract %563, %562 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg12, %564) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%565 = "mhlo.einsum"(%560, %64) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%566 = "mhlo.einsum"(%560, %129) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%567 = chlo.broadcast_multiply %283, %566 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%568 = "tf.ReadVariableOp"(%arg11) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%569 = chlo.broadcast_subtract %568, %567 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg11, %569) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%570 = "mhlo.einsum"(%559, %131) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%571 = chlo.broadcast_multiply %570, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%572 = "mhlo.reduce"(%571, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%573 = chlo.broadcast_multiply %283, %572 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%574 = "tf.ReadVariableOp"(%arg10) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%575 = chlo.broadcast_subtract %574, %573 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg10, %575) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%576 = "mhlo.einsum"(%571, %68) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%577 = "mhlo.einsum"(%571, %129) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%578 = chlo.broadcast_multiply %283, %577 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%579 = "tf.ReadVariableOp"(%arg9) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%580 = chlo.broadcast_subtract %579, %578 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg9, %580) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%581 = "mhlo.einsum"(%551, %145) {einsum_config = "abcd,acbe->aecd"} : (tensor<1x512x12x64xf32>, tensor<1x12x512x512xf32>) -> tensor<1x512x12x64xf32>
%582 = "mhlo.reduce"(%581, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%583 = chlo.broadcast_multiply %283, %582 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%584 = "tf.ReadVariableOp"(%arg14) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%585 = chlo.broadcast_subtract %584, %583 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg14, %585) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%586 = "mhlo.einsum"(%581, %70) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%587 = chlo.broadcast_add %544, %586 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%588 = chlo.broadcast_add %565, %576 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%589 = chlo.broadcast_add %587, %588 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%590 = chlo.broadcast_multiply %589, %27 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%591 = chlo.broadcast_multiply %590, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%592 = chlo.broadcast_multiply %591, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%593 = chlo.broadcast_multiply %591, %112 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%594 = "mhlo.negate"(%591) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%595 = chlo.broadcast_multiply %594, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%596 = "mhlo.reduce"(%595, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%597 = "mhlo.reshape"(%596) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%598 = "mhlo.broadcast_in_dim"(%597) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%599 = chlo.broadcast_multiply %598, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%600 = chlo.broadcast_multiply %115, %594 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%601 = chlo.broadcast_add %593, %600 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%602 = chlo.broadcast_multiply %601, %49 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%603 = "mhlo.reduce"(%602, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%604 = "mhlo.reshape"(%603) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%605 = chlo.broadcast_multiply %122, %122 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%606 = chlo.broadcast_multiply %605, %122 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%607 = chlo.broadcast_divide %604, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%608 = chlo.broadcast_multiply %606, %607 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%609 = "mhlo.broadcast_in_dim"(%608) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%610 = chlo.broadcast_multiply %609, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%611 = chlo.broadcast_multiply %610, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%612 = chlo.broadcast_subtract %112, %115 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%613 = chlo.broadcast_multiply %611, %612 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%614 = chlo.broadcast_add %592, %613 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%615 = chlo.broadcast_add %614, %599 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%616 = "mhlo.reshape"(%615) : (tensor<1x512x768xf32>) -> tensor<512x768xf32>
%617 = chlo.broadcast_multiply %283, %616 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<512x768xf32>) -> tensor<512x768xf32>
%618 = "tf.ReadVariableOp"(%arg5) : (tensor<!tf.resource<tensor<512x768xf32>>>) -> tensor<512x768xf32>
%619 = chlo.broadcast_subtract %618, %617 : (tensor<512x768xf32>, tensor<512x768xf32>) -> tensor<512x768xf32>
"tf.AssignVariableOp"(%arg5, %619) : (tensor<!tf.resource<tensor<512x768xf32>>>, tensor<512x768xf32>) -> ()
%620 = "mhlo.reshape"(%615) : (tensor<1x512x768xf32>) -> tensor<512x768xf32>
%621 = "mhlo.transpose"(%109) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<512x16xf32>) -> tensor<16x512xf32>
%622 = "mhlo.dot"(%621, %620) : (tensor<16x512xf32>, tensor<512x768xf32>) -> tensor<16x768xf32>
%623 = chlo.broadcast_multiply %283, %622 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<16x768xf32>) -> tensor<16x768xf32>
%624 = "tf.ReadVariableOp"(%arg6) : (tensor<!tf.resource<tensor<16x768xf32>>>) -> tensor<16x768xf32>
%625 = chlo.broadcast_subtract %624, %623 : (tensor<16x768xf32>, tensor<16x768xf32>) -> tensor<16x768xf32>
"tf.AssignVariableOp"(%arg6, %625) : (tensor<!tf.resource<tensor<16x768xf32>>>, tensor<16x768xf32>) -> ()
%626 = "mhlo.negate"(%620) : (tensor<512x768xf32>) -> tensor<512x768xf32>
%627 = chlo.broadcast_multiply %626, %283 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<512x768xf32>, tensor<f32>) -> tensor<512x768xf32>
"tf.ResourceScatterAdd"(%arg4, %91, %627) {_class = ["loc:@bert_classifier/bert_encoder_1/word_embeddings/Gather/resource"], device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<!tf.resource<tensor<100x768xf32>>>, tensor<512xi32>, tensor<512x768xf32>) -> ()
%628 = chlo.broadcast_multiply %122, %601 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%629 = "mhlo.reduce"(%628, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%630 = chlo.broadcast_multiply %283, %629 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%631 = "tf.ReadVariableOp"(%arg7) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%632 = chlo.broadcast_subtract %631, %630 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg7, %632) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%633 = "mhlo.reduce"(%591, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%634 = chlo.broadcast_multiply %283, %633 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%635 = "tf.ReadVariableOp"(%arg8) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%636 = chlo.broadcast_subtract %635, %634 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg8, %636) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%637 = "mhlo.einsum"(%581, %129) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%638 = chlo.broadcast_multiply %283, %637 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%639 = "tf.ReadVariableOp"(%arg13) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%640 = chlo.broadcast_subtract %639, %638 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg13, %640) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%641 = "mhlo.einsum"(%546, %148) {einsum_config = "abe,abcd->cde"} : (tensor<1x512x768xf32>, tensor<1x512x12x64xf32>) -> tensor<12x64x768xf32>
%642 = chlo.broadcast_multiply %283, %641 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
%643 = "tf.ReadVariableOp"(%arg15) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%644 = chlo.broadcast_subtract %643, %642 : (tensor<12x64x768xf32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
"tf.AssignVariableOp"(%arg15, %644) : (tensor<!tf.resource<tensor<12x64x768xf32>>>, tensor<12x64x768xf32>) -> ()
%645 = chlo.broadcast_multiply %163, %530 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%646 = "mhlo.reduce"(%645, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%647 = chlo.broadcast_multiply %283, %646 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%648 = "tf.ReadVariableOp"(%arg17) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%649 = chlo.broadcast_subtract %648, %647 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg17, %649) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%650 = "mhlo.reduce"(%520, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%651 = chlo.broadcast_multiply %283, %650 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%652 = "tf.ReadVariableOp"(%arg18) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%653 = chlo.broadcast_subtract %652, %651 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg18, %653) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%654 = "mhlo.einsum"(%514, %168) {einsum_config = "abd,abc->cd"} : (tensor<1x512x3072xf32>, tensor<1x512x768xf32>) -> tensor<768x3072xf32>
%655 = chlo.broadcast_multiply %283, %654 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
%656 = "tf.ReadVariableOp"(%arg19) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%657 = chlo.broadcast_subtract %656, %655 : (tensor<768x3072xf32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
"tf.AssignVariableOp"(%arg19, %657) : (tensor<!tf.resource<tensor<768x3072xf32>>>, tensor<768x3072xf32>) -> ()
%658 = "mhlo.einsum"(%497, %178) {einsum_config = "abd,abc->cd"} : (tensor<1x512x768xf32>, tensor<1x512x3072xf32>) -> tensor<3072x768xf32>
%659 = chlo.broadcast_multiply %283, %658 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
%660 = "tf.ReadVariableOp"(%arg21) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%661 = chlo.broadcast_subtract %660, %659 : (tensor<3072x768xf32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
"tf.AssignVariableOp"(%arg21, %661) : (tensor<!tf.resource<tensor<3072x768xf32>>>, tensor<3072x768xf32>) -> ()
%662 = chlo.broadcast_multiply %193, %481 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%663 = "mhlo.reduce"(%662, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%664 = chlo.broadcast_multiply %283, %663 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%665 = "tf.ReadVariableOp"(%arg23) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%666 = chlo.broadcast_subtract %665, %664 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg23, %666) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%667 = "mhlo.reduce"(%471, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%668 = chlo.broadcast_multiply %283, %667 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%669 = "tf.ReadVariableOp"(%arg24) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%670 = chlo.broadcast_subtract %669, %668 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg24, %670) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%671 = "mhlo.einsum"(%463, %198) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%672 = chlo.broadcast_multiply %283, %671 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%673 = "tf.ReadVariableOp"(%arg29) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%674 = chlo.broadcast_subtract %673, %672 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg29, %674) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%675 = "mhlo.einsum"(%428, %217) {einsum_config = "abe,abcd->cde"} : (tensor<1x512x768xf32>, tensor<1x512x12x64xf32>) -> tensor<12x64x768xf32>
%676 = chlo.broadcast_multiply %283, %675 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
%677 = "tf.ReadVariableOp"(%arg31) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%678 = chlo.broadcast_subtract %677, %676 : (tensor<12x64x768xf32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
"tf.AssignVariableOp"(%arg31, %678) : (tensor<!tf.resource<tensor<12x64x768xf32>>>, tensor<12x64x768xf32>) -> ()
%679 = chlo.broadcast_multiply %232, %412 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%680 = "mhlo.reduce"(%679, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%681 = chlo.broadcast_multiply %283, %680 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%682 = "tf.ReadVariableOp"(%arg33) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%683 = chlo.broadcast_subtract %682, %681 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg33, %683) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%684 = "mhlo.reduce"(%402, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%685 = chlo.broadcast_multiply %283, %684 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%686 = "tf.ReadVariableOp"(%arg34) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%687 = chlo.broadcast_subtract %686, %685 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg34, %687) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%688 = "mhlo.einsum"(%396, %237) {einsum_config = "abd,abc->cd"} : (tensor<1x512x3072xf32>, tensor<1x512x768xf32>) -> tensor<768x3072xf32>
%689 = chlo.broadcast_multiply %283, %688 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
%690 = "tf.ReadVariableOp"(%arg35) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%691 = chlo.broadcast_subtract %690, %689 : (tensor<768x3072xf32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
"tf.AssignVariableOp"(%arg35, %691) : (tensor<!tf.resource<tensor<768x3072xf32>>>, tensor<768x3072xf32>) -> ()
%692 = "mhlo.einsum"(%379, %247) {einsum_config = "abd,abc->cd"} : (tensor<1x512x768xf32>, tensor<1x512x3072xf32>) -> tensor<3072x768xf32>
%693 = chlo.broadcast_multiply %283, %692 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
%694 = "tf.ReadVariableOp"(%arg37) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%695 = chlo.broadcast_subtract %694, %693 : (tensor<3072x768xf32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
"tf.AssignVariableOp"(%arg37, %695) : (tensor<!tf.resource<tensor<3072x768xf32>>>, tensor<3072x768xf32>) -> ()
%696 = chlo.broadcast_multiply %262, %363 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%697 = "mhlo.reduce"(%696, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%698 = chlo.broadcast_multiply %283, %697 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%699 = "tf.ReadVariableOp"(%arg39) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%700 = chlo.broadcast_subtract %699, %698 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg39, %700) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%701 = "mhlo.reduce"(%352, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%702 = chlo.broadcast_multiply %283, %701 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%703 = "tf.ReadVariableOp"(%arg40) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%704 = chlo.broadcast_subtract %703, %702 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg40, %704) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%705 = "mhlo.transpose"(%269) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x768xf32>) -> tensor<768x1xf32>
%706 = "mhlo.dot"(%705, %344) : (tensor<768x1xf32>, tensor<1x768xf32>) -> tensor<768x768xf32>
%707 = chlo.broadcast_multiply %283, %706 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x768xf32>) -> tensor<768x768xf32>
%708 = "tf.ReadVariableOp"(%arg41) : (tensor<!tf.resource<tensor<768x768xf32>>>) -> tensor<768x768xf32>
%709 = chlo.broadcast_subtract %708, %707 : (tensor<768x768xf32>, tensor<768x768xf32>) -> tensor<768x768xf32>
"tf.AssignVariableOp"(%arg41, %709) : (tensor<!tf.resource<tensor<768x768xf32>>>, tensor<768x768xf32>) -> ()
%710 = "mhlo.transpose"(%275) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x768xf32>) -> tensor<768x1xf32>
%711 = "mhlo.dot"(%710, %333) : (tensor<768x1xf32>, tensor<1x5xf32>) -> tensor<768x5xf32>
%712 = chlo.broadcast_multiply %283, %711 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x5xf32>) -> tensor<768x5xf32>
%713 = "tf.ReadVariableOp"(%arg43) : (tensor<!tf.resource<tensor<768x5xf32>>>) -> tensor<768x5xf32>
%714 = chlo.broadcast_subtract %713, %712 : (tensor<768x5xf32>, tensor<768x5xf32>) -> tensor<768x5xf32>
"tf.AssignVariableOp"(%arg43, %714) : (tensor<!tf.resource<tensor<768x5xf32>>>, tensor<768x5xf32>) -> ()
%715 = "tf.ReadVariableOp"(%arg46) : (tensor<!tf.resource<tensor<i64>>>) -> tensor<i64>
%716 = chlo.broadcast_add %715, %0 : (tensor<i64>, tensor<i64>) -> tensor<i64>
"tf.AssignVariableOp"(%arg46, %716) : (tensor<!tf.resource<tensor<i64>>>, tensor<i64>) -> ()
%717 = chlo.broadcast_compare %23, %24 {comparison_direction = "EQ"} : (tensor<f32>, tensor<f32>) -> tensor<i1>
%718 = chlo.broadcast_divide %324, %23 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%719 = chlo.broadcast_select %717, %24, %718 : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
return %719 : tensor<f32>
}
// Exported "predict" entry point of a BERT-style text classifier, dumped as
// MLIR (mhlo/chlo dialects) before the LowerGlobalTensors pass.
// Inputs: %arg0 token ids, %arg1 attention mask, %arg2 token-type ids (all
// 1x512 i32); %arg3..%arg43 are resource handles to the model weights.
// Output: 1x5 classification logits.
func @"__inference_<lambda>_32400"(%arg0: tensor<1x512xi32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0, 0]}, %arg1: tensor<1x512xi32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0, 1]}, %arg2: tensor<1x512xi32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0, 2]}, %arg3: tensor<!tf.resource<tensor<100x768xf32>>> {tf_saved_model.bound_input = @"__sm_node45__m.layer-3.layer-1.embeddings"}, %arg4: tensor<!tf.resource<tensor<512x768xf32>>> {tf_saved_model.bound_input = @"__sm_node46__m.layer-3.layer-3.embeddings"}, %arg5: tensor<!tf.resource<tensor<16x768xf32>>> {tf_saved_model.bound_input = @"__sm_node47__m.layer-3.layer-4.embeddings"}, %arg6: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node48__m.layer-3.layer-6.gamma"}, %arg7: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node49__m.layer-3.layer-6.beta"}, %arg8: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node50__m.layer-3.layer-10._attention_layer._query_dense.kernel"}, %arg9: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node51__m.layer-3.layer-10._attention_layer._query_dense.bias"}, %arg10: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node52__m.layer-3.layer-10._attention_layer._key_dense.kernel"}, %arg11: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node53__m.layer-3.layer-10._attention_layer._key_dense.bias"}, %arg12: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node54__m.layer-3.layer-10._attention_layer._value_dense.kernel"}, %arg13: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node55__m.layer-3.layer-10._attention_layer._value_dense.bias"}, %arg14: tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node56__m.layer-3.layer-10._attention_layer._output_dense.kernel"}, 
%arg15: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node57__m.layer-3.layer-10._attention_layer._output_dense.bias"}, %arg16: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node58__m.layer-3.layer-10.keras_api.layers.2.gamma"}, %arg17: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node59__m.layer-3.layer-10.keras_api.layers.2.beta"}, %arg18: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node60__m.layer-3.layer-10.keras_api.layers.3.kernel"}, %arg19: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node61__m.layer-3.layer-10.keras_api.layers.3.bias"}, %arg20: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node62__m.layer-3.layer-10._output_dense.kernel"}, %arg21: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node63__m.layer-3.layer-10._output_dense.bias"}, %arg22: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node64__m.layer-3.layer-10._output_layer_norm.gamma"}, %arg23: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node65__m.layer-3.layer-10._output_layer_norm.beta"}, %arg24: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node66__m.layer-3.layer-11._attention_layer._query_dense.kernel"}, %arg25: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node67__m.layer-3.layer-11._attention_layer._query_dense.bias"}, %arg26: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node68__m.layer-3.layer-11._attention_layer._key_dense.kernel"}, %arg27: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node69__m.layer-3.layer-11._attention_layer._key_dense.bias"}, %arg28: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = 
@"__sm_node70__m.layer-3.layer-11._attention_layer._value_dense.kernel"}, %arg29: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node71__m.layer-3.layer-11._attention_layer._value_dense.bias"}, %arg30: tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node72__m.layer-3.layer-11._attention_layer._output_dense.kernel"}, %arg31: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node73__m.layer-3.layer-11._attention_layer._output_dense.bias"}, %arg32: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node74__m.layer-3.layer-11.keras_api.layers.2.gamma"}, %arg33: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node75__m.layer-3.layer-11.keras_api.layers.2.beta"}, %arg34: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node76__m.layer-3.layer-11.keras_api.layers.3.kernel"}, %arg35: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node77__m.layer-3.layer-11.keras_api.layers.3.bias"}, %arg36: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node78__m.layer-3.layer-11._output_dense.kernel"}, %arg37: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node79__m.layer-3.layer-11._output_dense.bias"}, %arg38: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node80__m.layer-3.layer-11._output_layer_norm.gamma"}, %arg39: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node81__m.layer-3.layer-11._output_layer_norm.beta"}, %arg40: tensor<!tf.resource<tensor<768x768xf32>>> {tf_saved_model.bound_input = @"__sm_node82__m.layer-3.layer-13.kernel"}, %arg41: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node83__m.layer-3.layer-13.bias"}, %arg42: tensor<!tf.resource<tensor<768x5xf32>>> {tf_saved_model.bound_input = @"__sm_node84__m.layer-5.out_proj.kernel"}, 
%arg43: tensor<!tf.resource<tensor<5xf32>>> {tf_saved_model.bound_input = @"__sm_node85__m.layer-5.out_proj.bias"}) -> (tensor<1x5xf32> {tf_saved_model.index_path = []}) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful, tf_saved_model.exported_names = ["predict"]} {
// --- Constants ---
// %0: additive mask fill value; %1 = 0.125 = 1/sqrt(64) attention scale
// (head dim 64); %2 = 0.797884583 ~= sqrt(2/pi) and %6 = 0.044715 — the
// coefficients of the tanh GELU approximation used below; %8: layer-norm
// epsilon; %9 = -inf, identity for the max-reduce; %11 = 768.0, hidden size
// used as the divisor for mean/variance.
%0 = mhlo.constant dense<-1.000000e+09> : tensor<f32>
%1 = mhlo.constant dense<1.250000e-01> : tensor<f32>
%2 = mhlo.constant dense<0.797884583> : tensor<f32>
%3 = mhlo.constant dense<5.000000e-01> : tensor<f32>
%4 = mhlo.constant dense<1.000000e+00> : tensor<f32>
%5 = mhlo.constant dense<3.000000e+00> : tensor<f32>
%6 = mhlo.constant dense<4.471500e-02> : tensor<f32>
%7 = mhlo.constant dense<1.000000e+00> : tensor<1x512x1xf32>
%8 = mhlo.constant dense<9.99999996E-13> : tensor<f32>
%9 = mhlo.constant dense<0xFF800000> : tensor<f32>
%10 = mhlo.constant dense<0.000000e+00> : tensor<f32>
%11 = mhlo.constant dense<7.680000e+02> : tensor<f32>
// --- Weight loads: read every bound variable out of its tf resource ---
%12 = "tf.ReadVariableOp"(%arg6) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%13 = "tf.ReadVariableOp"(%arg7) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%14 = "tf.ReadVariableOp"(%arg41) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%15 = "tf.ReadVariableOp"(%arg40) : (tensor<!tf.resource<tensor<768x768xf32>>>) -> tensor<768x768xf32>
// Position embedding table (512x768), reshaped to broadcast over the batch.
%16 = "tf.ReadVariableOp"(%arg4) : (tensor<!tf.resource<tensor<512x768xf32>>>) -> tensor<512x768xf32>
%17 = "mhlo.reshape"(%16) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%18 = "tf.ReadVariableOp"(%arg19) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%19 = "tf.ReadVariableOp"(%arg18) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%20 = "tf.ReadVariableOp"(%arg21) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%21 = "tf.ReadVariableOp"(%arg20) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%22 = "tf.ReadVariableOp"(%arg22) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%23 = "tf.ReadVariableOp"(%arg23) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%24 = "tf.ReadVariableOp"(%arg15) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%25 = "tf.ReadVariableOp"(%arg14) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%26 = "tf.ReadVariableOp"(%arg11) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%27 = "tf.ReadVariableOp"(%arg10) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%28 = "tf.ReadVariableOp"(%arg16) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%29 = "tf.ReadVariableOp"(%arg17) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%30 = "tf.ReadVariableOp"(%arg9) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%31 = "tf.ReadVariableOp"(%arg8) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%32 = "tf.ReadVariableOp"(%arg13) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%33 = "tf.ReadVariableOp"(%arg12) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%34 = "tf.ReadVariableOp"(%arg35) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%35 = "tf.ReadVariableOp"(%arg34) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%36 = "tf.ReadVariableOp"(%arg37) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%37 = "tf.ReadVariableOp"(%arg36) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%38 = "tf.ReadVariableOp"(%arg38) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%39 = "tf.ReadVariableOp"(%arg39) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%40 = "tf.ReadVariableOp"(%arg31) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%41 = "tf.ReadVariableOp"(%arg30) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%42 = "tf.ReadVariableOp"(%arg27) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%43 = "tf.ReadVariableOp"(%arg26) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%44 = "tf.ReadVariableOp"(%arg32) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%45 = "tf.ReadVariableOp"(%arg33) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%46 = "tf.ReadVariableOp"(%arg25) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%47 = "tf.ReadVariableOp"(%arg24) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%48 = "tf.ReadVariableOp"(%arg29) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%49 = "tf.ReadVariableOp"(%arg28) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%50 = "tf.ReadVariableOp"(%arg5) : (tensor<!tf.resource<tensor<16x768xf32>>>) -> tensor<16x768xf32>
%51 = "tf.ReadVariableOp"(%arg43) : (tensor<!tf.resource<tensor<5xf32>>>) -> tensor<5xf32>
%52 = "tf.ReadVariableOp"(%arg42) : (tensor<!tf.resource<tensor<768x5xf32>>>) -> tensor<768x5xf32>
// --- Embeddings ---
// Word embeddings: gather rows of the 100x768 table (%arg3) by token id
// (%arg0), then add the position embeddings (%17).
%53 = "mhlo.reshape"(%arg0) : (tensor<1x512xi32>) -> tensor<512xi32>
%54 = "tf.ReadVariableOp"(%arg3) : (tensor<!tf.resource<tensor<100x768xf32>>>) -> tensor<100x768xf32>
%55 = "mhlo.torch_index_select"(%54, %53) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<100x768xf32>, tensor<512xi32>) -> tensor<512x768xf32>
%56 = "mhlo.reshape"(%55) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%57 = chlo.broadcast_add %56, %17 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Attention mask: expand %arg1 to 1x1x512x512, then (1 - mask) * -1e9 so
// masked positions get a large negative additive bias before softmax.
%58 = "mhlo.reshape"(%arg1) : (tensor<1x512xi32>) -> tensor<1x1x512xi32>
%59 = "mhlo.convert"(%58) : (tensor<1x1x512xi32>) -> tensor<1x1x512xf32>
%60 = chlo.broadcast_multiply %59, %7 : (tensor<1x1x512xf32>, tensor<1x512x1xf32>) -> tensor<1x512x512xf32>
%61 = "mhlo.reshape"(%60) : (tensor<1x512x512xf32>) -> tensor<1x1x512x512xf32>
%62 = chlo.broadcast_subtract %4, %61 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x1x512x512xf32>) -> tensor<1x1x512x512xf32>
%63 = chlo.broadcast_multiply %62, %0 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x512x512xf32>, tensor<f32>) -> tensor<1x1x512x512xf32>
// Token-type embeddings: one-hot(%arg2, depth 16) via iota+compare+select,
// matmul against the 16x768 table (%50), then add into the embedding sum.
%64 = "mhlo.reshape"(%arg2) : (tensor<1x512xi32>) -> tensor<512xi32>
%65 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<16xi32>
%66 = "mhlo.broadcast_in_dim"(%65) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<16xi32>) -> tensor<512x16xi32>
%67 = "mhlo.broadcast_in_dim"(%64) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<512xi32>) -> tensor<512x16xi32>
%68 = "mhlo.compare"(%67, %66) {comparison_direction = "EQ"} : (tensor<512x16xi32>, tensor<512x16xi32>) -> tensor<512x16xi1>
%69 = "mhlo.broadcast"(%4) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%70 = "mhlo.broadcast"(%10) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%71 = "mhlo.select"(%68, %69, %70) : (tensor<512x16xi1>, tensor<512x16xf32>, tensor<512x16xf32>) -> tensor<512x16xf32>
%72 = "mhlo.dot"(%71, %50) : (tensor<512x16xf32>, tensor<16x768xf32>) -> tensor<512x768xf32>
%73 = "mhlo.reshape"(%72) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%74 = chlo.broadcast_add %57, %73 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Embedding layer norm over the last dim: mean (%77), variance (%82),
// rsqrt(var + eps), scaled by gamma %12 and shifted by beta %13.
%75 = "mhlo.reduce"(%74, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%76 = chlo.broadcast_divide %75, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%77 = "mhlo.reshape"(%76) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%78 = chlo.broadcast_subtract %74, %77 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%79 = chlo.broadcast_multiply %78, %78 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%80 = "mhlo.reduce"(%79, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%81 = chlo.broadcast_divide %80, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%82 = "mhlo.reshape"(%81) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%83 = chlo.broadcast_add %82, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%84 = "mhlo.rsqrt"(%83) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%85 = chlo.broadcast_multiply %84, %12 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%86 = chlo.broadcast_multiply %74, %85 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%87 = chlo.broadcast_multiply %77, %85 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%88 = chlo.broadcast_subtract %13, %87 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%89 = chlo.broadcast_add %86, %88 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// --- Transformer layer 1: multi-head self-attention (12 heads x 64 dims) ---
// Query (%91) and key (%93) projections; keys scaled by 1/sqrt(64) (%94);
// raw scores via einsum (%95) plus the additive attention mask (%63).
%90 = "mhlo.einsum"(%89, %27) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%91 = chlo.broadcast_add %90, %26 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%92 = "mhlo.einsum"(%89, %31) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%93 = chlo.broadcast_add %92, %30 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%94 = chlo.broadcast_multiply %93, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%95 = "mhlo.einsum"(%91, %94) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%96 = chlo.broadcast_add %95, %63 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
// Numerically stable softmax over the last (key) dim: subtract row max,
// exponentiate, divide by the row sum.
%97 = "mhlo.reduce"(%96, %9) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.maximum %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%98 = "mhlo.broadcast_in_dim"(%97) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%99 = mhlo.subtract %96, %98 : tensor<1x12x512x512xf32>
%100 = "mhlo.exponential"(%99) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%101 = "mhlo.reduce"(%100, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%102 = "mhlo.broadcast_in_dim"(%101) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%103 = mhlo.divide %100, %102 : tensor<1x12x512x512xf32>
// Value projection (%105), attention-weighted context (%106), output dense
// back to 768 (%107+%24), then residual add with the layer input %89.
%104 = "mhlo.einsum"(%89, %33) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%105 = chlo.broadcast_add %104, %32 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%106 = "mhlo.einsum"(%103, %105) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%107 = "mhlo.einsum"(%106, %25) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%108 = chlo.broadcast_add %107, %24 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%109 = chlo.broadcast_add %89, %108 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Post-attention layer norm (same mean/variance/rsqrt pattern as above)
// with gamma %28 / beta %29.
%110 = "mhlo.reduce"(%109, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%111 = chlo.broadcast_divide %110, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%112 = "mhlo.reshape"(%111) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%113 = chlo.broadcast_subtract %109, %112 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%114 = chlo.broadcast_multiply %113, %113 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%115 = "mhlo.reduce"(%114, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%116 = chlo.broadcast_divide %115, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%117 = "mhlo.reshape"(%116) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%118 = chlo.broadcast_add %117, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%119 = "mhlo.rsqrt"(%118) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%120 = chlo.broadcast_multiply %119, %28 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%121 = chlo.broadcast_multiply %109, %120 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%122 = chlo.broadcast_multiply %112, %120 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%123 = chlo.broadcast_subtract %29, %122 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%124 = chlo.broadcast_add %121, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Feed-forward: 768 -> 3072, tanh-based GELU approximation
// 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))), then 3072 -> 768.
%125 = "mhlo.einsum"(%124, %19) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%126 = chlo.broadcast_add %125, %18 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%127 = chlo.broadcast_power %126, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%128 = chlo.broadcast_multiply %127, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%129 = chlo.broadcast_add %126, %128 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%130 = chlo.broadcast_multiply %129, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%131 = "mhlo.tanh"(%130) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%132 = chlo.broadcast_add %131, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%133 = chlo.broadcast_multiply %126, %3 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%134 = chlo.broadcast_multiply %133, %132 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%135 = "mhlo.einsum"(%134, %21) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%136 = chlo.broadcast_add %135, %20 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
// Residual add, then output layer norm (gamma %22 / beta %23); %152 is the
// final output of transformer layer 1.
%137 = chlo.broadcast_add %136, %124 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%138 = "mhlo.reduce"(%137, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%139 = chlo.broadcast_divide %138, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%140 = "mhlo.reshape"(%139) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%141 = chlo.broadcast_subtract %137, %140 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%142 = chlo.broadcast_multiply %141, %141 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%143 = "mhlo.reduce"(%142, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%144 = chlo.broadcast_divide %143, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%145 = "mhlo.reshape"(%144) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%146 = chlo.broadcast_add %145, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%147 = "mhlo.rsqrt"(%146) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%148 = chlo.broadcast_multiply %147, %22 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%149 = chlo.broadcast_multiply %137, %148 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%150 = chlo.broadcast_multiply %140, %148 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%151 = chlo.broadcast_subtract %23, %150 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%152 = chlo.broadcast_add %149, %151 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// --- Transformer layer 2: identical structure to layer 1, applied to %152
// with the layer-11 weight set (query/key/value, softmax, output dense,
// residual + layer norm, GELU feed-forward, residual + layer norm). ---
%153 = "mhlo.einsum"(%152, %43) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%154 = chlo.broadcast_add %153, %42 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%155 = "mhlo.einsum"(%152, %47) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%156 = chlo.broadcast_add %155, %46 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%157 = chlo.broadcast_multiply %156, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%158 = "mhlo.einsum"(%154, %157) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%159 = chlo.broadcast_add %158, %63 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
%160 = "mhlo.reduce"(%159, %9) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.maximum %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%161 = "mhlo.broadcast_in_dim"(%160) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%162 = mhlo.subtract %159, %161 : tensor<1x12x512x512xf32>
%163 = "mhlo.exponential"(%162) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%164 = "mhlo.reduce"(%163, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%165 = "mhlo.broadcast_in_dim"(%164) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%166 = mhlo.divide %163, %165 : tensor<1x12x512x512xf32>
%167 = "mhlo.einsum"(%152, %49) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%168 = chlo.broadcast_add %167, %48 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%169 = "mhlo.einsum"(%166, %168) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%170 = "mhlo.einsum"(%169, %41) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%171 = chlo.broadcast_add %170, %40 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%172 = chlo.broadcast_add %152, %171 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%173 = "mhlo.reduce"(%172, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%174 = chlo.broadcast_divide %173, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%175 = "mhlo.reshape"(%174) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%176 = chlo.broadcast_subtract %172, %175 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%177 = chlo.broadcast_multiply %176, %176 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%178 = "mhlo.reduce"(%177, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%179 = chlo.broadcast_divide %178, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%180 = "mhlo.reshape"(%179) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%181 = chlo.broadcast_add %180, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%182 = "mhlo.rsqrt"(%181) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%183 = chlo.broadcast_multiply %182, %44 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%184 = chlo.broadcast_multiply %172, %183 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%185 = chlo.broadcast_multiply %175, %183 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%186 = chlo.broadcast_subtract %45, %185 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%187 = chlo.broadcast_add %184, %186 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%188 = "mhlo.einsum"(%187, %35) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%189 = chlo.broadcast_add %188, %34 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%190 = chlo.broadcast_power %189, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%191 = chlo.broadcast_multiply %190, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%192 = chlo.broadcast_add %189, %191 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%193 = chlo.broadcast_multiply %192, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%194 = "mhlo.tanh"(%193) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%195 = chlo.broadcast_add %194, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%196 = chlo.broadcast_multiply %189, %3 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%197 = chlo.broadcast_multiply %196, %195 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%198 = "mhlo.einsum"(%197, %37) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%199 = chlo.broadcast_add %198, %36 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%200 = chlo.broadcast_add %199, %187 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%201 = "mhlo.reduce"(%200, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%202 = chlo.broadcast_divide %201, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%203 = "mhlo.reshape"(%202) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%204 = chlo.broadcast_subtract %200, %203 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%205 = chlo.broadcast_multiply %204, %204 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%206 = "mhlo.reduce"(%205, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%207 = chlo.broadcast_divide %206, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%208 = "mhlo.reshape"(%207) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%209 = chlo.broadcast_add %208, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%210 = "mhlo.rsqrt"(%209) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%211 = chlo.broadcast_multiply %210, %38 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%212 = chlo.broadcast_multiply %200, %211 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%213 = chlo.broadcast_multiply %203, %211 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%214 = chlo.broadcast_subtract %39, %213 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%215 = chlo.broadcast_add %212, %214 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// --- Pooler: slice out sequence position 0 (the first token of the 512),
// dense 768->768 (%15/%14) followed by tanh. ---
%216 = "mhlo.slice"(%215) {limit_indices = dense<[1, 1, 768]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x512x768xf32>) -> tensor<1x1x768xf32>
%217 = "mhlo.reshape"(%216) : (tensor<1x1x768xf32>) -> tensor<1x768xf32>
%218 = "mhlo.dot"(%217, %15) : (tensor<1x768xf32>, tensor<768x768xf32>) -> tensor<1x768xf32>
%219 = "mhlo.broadcast_in_dim"(%14) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<768xf32>) -> tensor<1x768xf32>
%220 = mhlo.add %218, %219 : tensor<1x768xf32>
%221 = "mhlo.tanh"(%220) : (tensor<1x768xf32>) -> tensor<1x768xf32>
// --- Classifier head: dense 768->5 (%52) plus bias (%51) -> logits. ---
%222 = "mhlo.dot"(%221, %52) : (tensor<1x768xf32>, tensor<768x5xf32>) -> tensor<1x5xf32>
%223 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<1x5xf32>
%224 = mhlo.add %222, %223 : tensor<1x5xf32>
return %224 : tensor<1x5xf32>
}
}
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:725:0: error: could not lower resource op to flow: tf.ResourceScatterAdd
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:678:0: note: called from
/usr/local/google/home/laurenzo/src/ModelCompiler/nlp_gen/bert_gen.py:61:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:983:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3983:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py:668:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:1007:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3291:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3456:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3109:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:725:0: note: see current operation: "tf.ResourceScatterAdd"(%0, %134, %670) {_class = ["loc:@bert_classifier/bert_encoder_1/word_embeddings/Gather/resource"], device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (!iree.ptr<tensor<100x768xf32>>, tensor<512xi32>, tensor<512x768xf32>) -> ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment