Skip to content

Instantly share code, notes, and snippets.

@stellaraccident
Created June 16, 2021 20:46
Show Gist options
  • Save stellaraccident/30ffddc028b0941ec6c2c3a7a8b71290 to your computer and use it in GitHub Desktop.
// -----// IR Dump Before mlir::iree_integrations::TF::LowerGlobalTensors //----- //
module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 779 : i32}, tf_saved_model.semantics} {
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node15__optimizer.iter", tf_saved_model.exported_names = [], type = tensor<i64>, value = dense<0> : tensor<i64>} : () -> ()
"tf_saved_model.global_tensor"() {sym_name = "__sm_node17__optimizer.learning_rate", tf_saved_model.exported_names = [], type = tensor<f32>, value = dense<0.00999999977> : tensor<f32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node45__m.layer-3.layer-1.embeddings", tf_saved_model.exported_names = [], type = tensor<100x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<100x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node46__m.layer-3.layer-3.embeddings", tf_saved_model.exported_names = [], type = tensor<512x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<512x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node47__m.layer-3.layer-4.embeddings", tf_saved_model.exported_names = [], type = tensor<16x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<16x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node48__m.layer-3.layer-6.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node49__m.layer-3.layer-6.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node50__m.layer-3.layer-10._attention_layer._query_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node51__m.layer-3.layer-10._attention_layer._query_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node52__m.layer-3.layer-10._attention_layer._key_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node53__m.layer-3.layer-10._attention_layer._key_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node54__m.layer-3.layer-10._attention_layer._value_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node55__m.layer-3.layer-10._attention_layer._value_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node56__m.layer-3.layer-10._attention_layer._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<12x64x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<12x64x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node57__m.layer-3.layer-10._attention_layer._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node58__m.layer-3.layer-10.keras_api.layers.2.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node59__m.layer-3.layer-10.keras_api.layers.2.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node60__m.layer-3.layer-10.keras_api.layers.3.kernel", tf_saved_model.exported_names = [], type = tensor<768x3072xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node61__m.layer-3.layer-10.keras_api.layers.3.bias", tf_saved_model.exported_names = [], type = tensor<3072xf32>, value = dense<0.000000e+00> : tensor<3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node62__m.layer-3.layer-10._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<3072x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<3072x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node63__m.layer-3.layer-10._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node64__m.layer-3.layer-10._output_layer_norm.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node65__m.layer-3.layer-10._output_layer_norm.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node66__m.layer-3.layer-11._attention_layer._query_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node67__m.layer-3.layer-11._attention_layer._query_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node68__m.layer-3.layer-11._attention_layer._key_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node69__m.layer-3.layer-11._attention_layer._key_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node70__m.layer-3.layer-11._attention_layer._value_dense.kernel", tf_saved_model.exported_names = [], type = tensor<768x12x64xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node71__m.layer-3.layer-11._attention_layer._value_dense.bias", tf_saved_model.exported_names = [], type = tensor<12x64xf32>, value = dense<0.000000e+00> : tensor<12x64xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node72__m.layer-3.layer-11._attention_layer._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<12x64x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<12x64x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node73__m.layer-3.layer-11._attention_layer._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node74__m.layer-3.layer-11.keras_api.layers.2.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node75__m.layer-3.layer-11.keras_api.layers.2.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node76__m.layer-3.layer-11.keras_api.layers.3.kernel", tf_saved_model.exported_names = [], type = tensor<768x3072xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node77__m.layer-3.layer-11.keras_api.layers.3.bias", tf_saved_model.exported_names = [], type = tensor<3072xf32>, value = dense<0.000000e+00> : tensor<3072xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node78__m.layer-3.layer-11._output_dense.kernel", tf_saved_model.exported_names = [], type = tensor<3072x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<3072x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node79__m.layer-3.layer-11._output_dense.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node80__m.layer-3.layer-11._output_layer_norm.gamma", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<1.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node81__m.layer-3.layer-11._output_layer_norm.beta", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node82__m.layer-3.layer-13.kernel", tf_saved_model.exported_names = [], type = tensor<768x768xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node83__m.layer-3.layer-13.bias", tf_saved_model.exported_names = [], type = tensor<768xf32>, value = dense<0.000000e+00> : tensor<768xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node84__m.layer-5.out_proj.kernel", tf_saved_model.exported_names = [], type = tensor<768x5xf32>, value = opaque<"_", "0xDEADBEEF"> : tensor<768x5xf32>} : () -> ()
"tf_saved_model.global_tensor"() {is_mutable, sym_name = "__sm_node85__m.layer-5.out_proj.bias", tf_saved_model.exported_names = [], type = tensor<5xf32>, value = dense<0.000000e+00> : tensor<5xf32>} : () -> ()
func @__inference_learn_29190(%arg0: tensor<1x512xi32> {tf._user_specified_name = "inputs", tf_saved_model.index_path = [0, 0, 0]}, %arg1: tensor<1x512xi32> {tf._user_specified_name = "inputs", tf_saved_model.index_path = [0, 0, 1]}, %arg2: tensor<1x512xi32> {tf._user_specified_name = "inputs", tf_saved_model.index_path = [0, 0, 2]}, %arg3: tensor<1xi32> {tf._user_specified_name = "labels", tf_saved_model.index_path = [1]}, %arg4: tensor<!tf.resource<tensor<100x768xf32>>> {tf_saved_model.bound_input = @"__sm_node45__m.layer-3.layer-1.embeddings"}, %arg5: tensor<!tf.resource<tensor<512x768xf32>>> {tf_saved_model.bound_input = @"__sm_node46__m.layer-3.layer-3.embeddings"}, %arg6: tensor<!tf.resource<tensor<16x768xf32>>> {tf_saved_model.bound_input = @"__sm_node47__m.layer-3.layer-4.embeddings"}, %arg7: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node48__m.layer-3.layer-6.gamma"}, %arg8: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node49__m.layer-3.layer-6.beta"}, %arg9: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node50__m.layer-3.layer-10._attention_layer._query_dense.kernel"}, %arg10: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node51__m.layer-3.layer-10._attention_layer._query_dense.bias"}, %arg11: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node52__m.layer-3.layer-10._attention_layer._key_dense.kernel"}, %arg12: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node53__m.layer-3.layer-10._attention_layer._key_dense.bias"}, %arg13: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node54__m.layer-3.layer-10._attention_layer._value_dense.kernel"}, %arg14: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node55__m.layer-3.layer-10._attention_layer._value_dense.bias"}, %arg15: 
tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node56__m.layer-3.layer-10._attention_layer._output_dense.kernel"}, %arg16: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node57__m.layer-3.layer-10._attention_layer._output_dense.bias"}, %arg17: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node58__m.layer-3.layer-10.keras_api.layers.2.gamma"}, %arg18: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node59__m.layer-3.layer-10.keras_api.layers.2.beta"}, %arg19: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node60__m.layer-3.layer-10.keras_api.layers.3.kernel"}, %arg20: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node61__m.layer-3.layer-10.keras_api.layers.3.bias"}, %arg21: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node62__m.layer-3.layer-10._output_dense.kernel"}, %arg22: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node63__m.layer-3.layer-10._output_dense.bias"}, %arg23: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node64__m.layer-3.layer-10._output_layer_norm.gamma"}, %arg24: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node65__m.layer-3.layer-10._output_layer_norm.beta"}, %arg25: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node66__m.layer-3.layer-11._attention_layer._query_dense.kernel"}, %arg26: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node67__m.layer-3.layer-11._attention_layer._query_dense.bias"}, %arg27: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node68__m.layer-3.layer-11._attention_layer._key_dense.kernel"}, %arg28: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = 
@"__sm_node69__m.layer-3.layer-11._attention_layer._key_dense.bias"}, %arg29: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node70__m.layer-3.layer-11._attention_layer._value_dense.kernel"}, %arg30: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node71__m.layer-3.layer-11._attention_layer._value_dense.bias"}, %arg31: tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node72__m.layer-3.layer-11._attention_layer._output_dense.kernel"}, %arg32: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node73__m.layer-3.layer-11._attention_layer._output_dense.bias"}, %arg33: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node74__m.layer-3.layer-11.keras_api.layers.2.gamma"}, %arg34: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node75__m.layer-3.layer-11.keras_api.layers.2.beta"}, %arg35: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node76__m.layer-3.layer-11.keras_api.layers.3.kernel"}, %arg36: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node77__m.layer-3.layer-11.keras_api.layers.3.bias"}, %arg37: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node78__m.layer-3.layer-11._output_dense.kernel"}, %arg38: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node79__m.layer-3.layer-11._output_dense.bias"}, %arg39: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node80__m.layer-3.layer-11._output_layer_norm.gamma"}, %arg40: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node81__m.layer-3.layer-11._output_layer_norm.beta"}, %arg41: tensor<!tf.resource<tensor<768x768xf32>>> {tf_saved_model.bound_input = @"__sm_node82__m.layer-3.layer-13.kernel"}, %arg42: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = 
@"__sm_node83__m.layer-3.layer-13.bias"}, %arg43: tensor<!tf.resource<tensor<768x5xf32>>> {tf_saved_model.bound_input = @"__sm_node84__m.layer-5.out_proj.kernel"}, %arg44: tensor<!tf.resource<tensor<5xf32>>> {tf_saved_model.bound_input = @"__sm_node85__m.layer-5.out_proj.bias"}, %arg45: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @__sm_node17__optimizer.learning_rate}, %arg46: tensor<!tf.resource<tensor<i64>>> {tf_saved_model.bound_input = @__sm_node15__optimizer.iter}) -> (tensor<f32> {tf_saved_model.index_path = []}) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<1>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful, tf_saved_model.exported_names = ["learn"]} {
%0 = mhlo.constant dense<1> : tensor<i64>
%1 = mhlo.constant dense<2.000000e+00> : tensor<f32>
%2 = mhlo.constant dense<1.000000e-07> : tensor<f32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<1x5xf32>
%4 = mhlo.constant dense<-1.000000e+09> : tensor<f32>
%5 = mhlo.constant dense<1.250000e-01> : tensor<f32>
%6 = mhlo.constant dense<0.797884583> : tensor<f32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<f32>
%8 = mhlo.constant dense<3.000000e+00> : tensor<f32>
%9 = mhlo.constant dense<4.471500e-02> : tensor<f32>
%10 = mhlo.constant dense<1.000000e+00> : tensor<1x512x1xf32>
%11 = mhlo.constant dense<9.99999996E-13> : tensor<f32>
%12 = mhlo.constant dense<1.000000e-01> : tensor<f32>
%13 = mhlo.constant dense<1.11111116> : tensor<f32>
%14 = mhlo.constant dense<0> : tensor<i64>
%15 = mhlo.constant dense<5> : tensor<i64>
%16 = mhlo.constant dense<0x7FC00000> : tensor<f32>
%17 = mhlo.constant dense<-2.000000e+00> : tensor<f32>
%18 = mhlo.constant dense<[1, 512, 768]> : tensor<3xi64>
%19 = mhlo.constant dense<[1, 12, 512, 512]> : tensor<4xi64>
%20 = mhlo.constant dense<[1, 768]> : tensor<2xi64>
%21 = mhlo.constant dense<7.680000e+02> : tensor<f32>
%22 = mhlo.constant dense<0xFF800000> : tensor<f32>
%23 = mhlo.constant dense<1.000000e+00> : tensor<f32>
%24 = mhlo.constant dense<0.000000e+00> : tensor<f32>
%25 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%26 = chlo.broadcast_compare %25, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%27 = "mhlo.convert"(%26) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%28 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%29 = chlo.broadcast_compare %28, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%30 = "mhlo.convert"(%29) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%31 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%32 = chlo.broadcast_compare %31, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%33 = "mhlo.convert"(%32) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%34 = "mhlo.rng_uniform"(%24, %23, %19) : (tensor<f32>, tensor<f32>, tensor<4xi64>) -> tensor<1x12x512x512xf32>
%35 = chlo.broadcast_compare %34, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xi1>
%36 = "mhlo.convert"(%35) : (tensor<1x12x512x512xi1>) -> tensor<1x12x512x512xf32>
%37 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%38 = chlo.broadcast_compare %37, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%39 = "mhlo.convert"(%38) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%40 = "mhlo.rng_uniform"(%24, %23, %18) : (tensor<f32>, tensor<f32>, tensor<3xi64>) -> tensor<1x512x768xf32>
%41 = chlo.broadcast_compare %40, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xi1>
%42 = "mhlo.convert"(%41) : (tensor<1x512x768xi1>) -> tensor<1x512x768xf32>
%43 = "mhlo.rng_uniform"(%24, %23, %19) : (tensor<f32>, tensor<f32>, tensor<4xi64>) -> tensor<1x12x512x512xf32>
%44 = chlo.broadcast_compare %43, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xi1>
%45 = "mhlo.convert"(%44) : (tensor<1x12x512x512xi1>) -> tensor<1x12x512x512xf32>
%46 = "mhlo.rng_uniform"(%24, %23, %20) : (tensor<f32>, tensor<f32>, tensor<2xi64>) -> tensor<1x768xf32>
%47 = chlo.broadcast_compare %46, %12 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x768xf32>, tensor<f32>) -> tensor<1x768xi1>
%48 = "mhlo.convert"(%47) : (tensor<1x768xi1>) -> tensor<1x768xf32>
%49 = "tf.ReadVariableOp"(%arg7) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%50 = "tf.ReadVariableOp"(%arg8) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%51 = "tf.ReadVariableOp"(%arg42) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%52 = "tf.ReadVariableOp"(%arg41) {device = ""} : (tensor<!tf.resource<tensor<768x768xf32>>>) -> tensor<768x768xf32>
%53 = "tf.ReadVariableOp"(%arg5) {device = ""} : (tensor<!tf.resource<tensor<512x768xf32>>>) -> tensor<512x768xf32>
%54 = "mhlo.reshape"(%53) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%55 = "tf.ReadVariableOp"(%arg20) {device = ""} : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%56 = "tf.ReadVariableOp"(%arg19) {device = ""} : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%57 = "tf.ReadVariableOp"(%arg22) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%58 = "tf.ReadVariableOp"(%arg21) {device = ""} : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%59 = "tf.ReadVariableOp"(%arg23) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%60 = "tf.ReadVariableOp"(%arg24) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%61 = "tf.ReadVariableOp"(%arg16) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%62 = "tf.ReadVariableOp"(%arg15) {device = ""} : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%63 = "tf.ReadVariableOp"(%arg12) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%64 = "tf.ReadVariableOp"(%arg11) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%65 = "tf.ReadVariableOp"(%arg17) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%66 = "tf.ReadVariableOp"(%arg18) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%67 = "tf.ReadVariableOp"(%arg10) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%68 = "tf.ReadVariableOp"(%arg9) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%69 = "tf.ReadVariableOp"(%arg14) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%70 = "tf.ReadVariableOp"(%arg13) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%71 = "tf.ReadVariableOp"(%arg36) {device = ""} : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%72 = "tf.ReadVariableOp"(%arg35) {device = ""} : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%73 = "tf.ReadVariableOp"(%arg38) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%74 = "tf.ReadVariableOp"(%arg37) {device = ""} : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%75 = "tf.ReadVariableOp"(%arg39) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%76 = "tf.ReadVariableOp"(%arg40) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%77 = "tf.ReadVariableOp"(%arg32) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%78 = "tf.ReadVariableOp"(%arg31) {device = ""} : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%79 = "tf.ReadVariableOp"(%arg28) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%80 = "tf.ReadVariableOp"(%arg27) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%81 = "tf.ReadVariableOp"(%arg33) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%82 = "tf.ReadVariableOp"(%arg34) {device = ""} : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%83 = "tf.ReadVariableOp"(%arg26) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%84 = "tf.ReadVariableOp"(%arg25) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%85 = "tf.ReadVariableOp"(%arg30) {device = ""} : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%86 = "tf.ReadVariableOp"(%arg29) {device = ""} : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%87 = "tf.ReadVariableOp"(%arg6) {device = ""} : (tensor<!tf.resource<tensor<16x768xf32>>>) -> tensor<16x768xf32>
%88 = "tf.ReadVariableOp"(%arg44) {device = ""} : (tensor<!tf.resource<tensor<5xf32>>>) -> tensor<5xf32>
%89 = "tf.ReadVariableOp"(%arg43) {device = ""} : (tensor<!tf.resource<tensor<768x5xf32>>>) -> tensor<768x5xf32>
%90 = chlo.broadcast_subtract %8, %23 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%91 = "mhlo.reshape"(%arg0) : (tensor<1x512xi32>) -> tensor<512xi32>
%92 = "tf.ReadVariableOp"(%arg4) : (tensor<!tf.resource<tensor<100x768xf32>>>) -> tensor<100x768xf32>
%93 = "mhlo.torch_index_select"(%92, %91) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<100x768xf32>, tensor<512xi32>) -> tensor<512x768xf32>
%94 = "mhlo.reshape"(%93) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%95 = chlo.broadcast_add %94, %54 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%96 = "mhlo.reshape"(%arg1) : (tensor<1x512xi32>) -> tensor<1x1x512xi32>
%97 = "mhlo.convert"(%96) : (tensor<1x1x512xi32>) -> tensor<1x1x512xf32>
%98 = chlo.broadcast_multiply %97, %10 : (tensor<1x1x512xf32>, tensor<1x512x1xf32>) -> tensor<1x512x512xf32>
%99 = "mhlo.reshape"(%98) : (tensor<1x512x512xf32>) -> tensor<1x1x512x512xf32>
%100 = chlo.broadcast_subtract %23, %99 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x1x512x512xf32>) -> tensor<1x1x512x512xf32>
%101 = chlo.broadcast_multiply %100, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x512x512xf32>, tensor<f32>) -> tensor<1x1x512x512xf32>
%102 = "mhlo.reshape"(%arg2) : (tensor<1x512xi32>) -> tensor<512xi32>
%103 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<16xi32>
%104 = "mhlo.broadcast_in_dim"(%103) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<16xi32>) -> tensor<512x16xi32>
%105 = "mhlo.broadcast_in_dim"(%102) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<512xi32>) -> tensor<512x16xi32>
%106 = "mhlo.compare"(%105, %104) {comparison_direction = "EQ"} : (tensor<512x16xi32>, tensor<512x16xi32>) -> tensor<512x16xi1>
%107 = "mhlo.broadcast"(%23) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%108 = "mhlo.broadcast"(%24) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%109 = "mhlo.select"(%106, %107, %108) : (tensor<512x16xi1>, tensor<512x16xf32>, tensor<512x16xf32>) -> tensor<512x16xf32>
%110 = "mhlo.dot"(%109, %87) : (tensor<512x16xf32>, tensor<16x768xf32>) -> tensor<512x768xf32>
%111 = "mhlo.reshape"(%110) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%112 = chlo.broadcast_add %95, %111 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%113 = "mhlo.reduce"(%112, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%114 = chlo.broadcast_divide %113, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%115 = "mhlo.reshape"(%114) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%116 = chlo.broadcast_subtract %112, %115 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%117 = chlo.broadcast_multiply %116, %116 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%118 = "mhlo.reduce"(%117, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%119 = chlo.broadcast_divide %118, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%120 = "mhlo.reshape"(%119) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%121 = chlo.broadcast_add %120, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%122 = "mhlo.rsqrt"(%121) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%123 = chlo.broadcast_multiply %122, %49 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%124 = chlo.broadcast_multiply %112, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%125 = chlo.broadcast_multiply %115, %123 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%126 = chlo.broadcast_subtract %50, %125 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%127 = chlo.broadcast_add %124, %126 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%128 = chlo.broadcast_multiply %127, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%129 = chlo.broadcast_multiply %128, %27 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%130 = "mhlo.einsum"(%129, %64) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%131 = chlo.broadcast_add %130, %63 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%132 = "mhlo.einsum"(%129, %68) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%133 = chlo.broadcast_add %132, %67 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%134 = chlo.broadcast_multiply %133, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%135 = "mhlo.einsum"(%131, %134) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%136 = chlo.broadcast_add %135, %101 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
%137 = "mhlo.reduce"(%136, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%138 = "mhlo.broadcast_in_dim"(%137) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%139 = mhlo.subtract %136, %138 : tensor<1x12x512x512xf32>
%140 = "mhlo.exponential"(%139) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%141 = "mhlo.reduce"(%140, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%142 = "mhlo.broadcast_in_dim"(%141) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%143 = mhlo.divide %140, %142 : tensor<1x12x512x512xf32>
%144 = chlo.broadcast_multiply %143, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%145 = chlo.broadcast_multiply %144, %36 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%146 = "mhlo.einsum"(%129, %70) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%147 = chlo.broadcast_add %146, %69 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%148 = "mhlo.einsum"(%145, %147) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%149 = "mhlo.einsum"(%148, %62) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%150 = chlo.broadcast_add %149, %61 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%151 = chlo.broadcast_multiply %150, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%152 = chlo.broadcast_multiply %151, %30 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%153 = chlo.broadcast_add %129, %152 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%154 = "mhlo.reduce"(%153, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%155 = chlo.broadcast_divide %154, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%156 = "mhlo.reshape"(%155) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%157 = chlo.broadcast_subtract %153, %156 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%158 = chlo.broadcast_multiply %157, %157 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%159 = "mhlo.reduce"(%158, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%160 = chlo.broadcast_divide %159, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%161 = "mhlo.reshape"(%160) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%162 = chlo.broadcast_add %161, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%163 = "mhlo.rsqrt"(%162) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%164 = chlo.broadcast_multiply %163, %65 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%165 = chlo.broadcast_multiply %153, %164 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%166 = chlo.broadcast_multiply %156, %164 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%167 = chlo.broadcast_subtract %66, %166 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%168 = chlo.broadcast_add %165, %167 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%169 = "mhlo.einsum"(%168, %56) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%170 = chlo.broadcast_add %169, %55 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%171 = chlo.broadcast_power %170, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%172 = chlo.broadcast_multiply %171, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%173 = chlo.broadcast_add %170, %172 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%174 = chlo.broadcast_multiply %173, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%175 = "mhlo.tanh"(%174) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%176 = chlo.broadcast_add %175, %23 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%177 = chlo.broadcast_multiply %170, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%178 = chlo.broadcast_multiply %177, %176 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%179 = "mhlo.einsum"(%178, %58) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%180 = chlo.broadcast_add %179, %57 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%181 = chlo.broadcast_multiply %180, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%182 = chlo.broadcast_multiply %181, %33 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%183 = chlo.broadcast_add %182, %168 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%184 = "mhlo.reduce"(%183, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%185 = chlo.broadcast_divide %184, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%186 = "mhlo.reshape"(%185) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%187 = chlo.broadcast_subtract %183, %186 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%188 = chlo.broadcast_multiply %187, %187 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%189 = "mhlo.reduce"(%188, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%190 = chlo.broadcast_divide %189, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%191 = "mhlo.reshape"(%190) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%192 = chlo.broadcast_add %191, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%193 = "mhlo.rsqrt"(%192) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%194 = chlo.broadcast_multiply %193, %59 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%195 = chlo.broadcast_multiply %183, %194 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%196 = chlo.broadcast_multiply %186, %194 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%197 = chlo.broadcast_subtract %60, %196 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%198 = chlo.broadcast_add %195, %197 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%199 = "mhlo.einsum"(%198, %80) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%200 = chlo.broadcast_add %199, %79 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%201 = "mhlo.einsum"(%198, %84) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%202 = chlo.broadcast_add %201, %83 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%203 = chlo.broadcast_multiply %202, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%204 = "mhlo.einsum"(%200, %203) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%205 = chlo.broadcast_add %204, %101 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
%206 = "mhlo.reduce"(%205, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%207 = "mhlo.broadcast_in_dim"(%206) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%208 = mhlo.subtract %205, %207 : tensor<1x12x512x512xf32>
%209 = "mhlo.exponential"(%208) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%210 = "mhlo.reduce"(%209, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%211 = "mhlo.broadcast_in_dim"(%210) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%212 = mhlo.divide %209, %211 : tensor<1x12x512x512xf32>
%213 = chlo.broadcast_multiply %212, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%214 = chlo.broadcast_multiply %213, %45 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%215 = "mhlo.einsum"(%198, %86) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%216 = chlo.broadcast_add %215, %85 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%217 = "mhlo.einsum"(%214, %216) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%218 = "mhlo.einsum"(%217, %78) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%219 = chlo.broadcast_add %218, %77 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%220 = chlo.broadcast_multiply %219, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%221 = chlo.broadcast_multiply %220, %39 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%222 = chlo.broadcast_add %198, %221 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%223 = "mhlo.reduce"(%222, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%224 = chlo.broadcast_divide %223, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%225 = "mhlo.reshape"(%224) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%226 = chlo.broadcast_subtract %222, %225 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%227 = chlo.broadcast_multiply %226, %226 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%228 = "mhlo.reduce"(%227, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%229 = chlo.broadcast_divide %228, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%230 = "mhlo.reshape"(%229) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%231 = chlo.broadcast_add %230, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%232 = "mhlo.rsqrt"(%231) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%233 = chlo.broadcast_multiply %232, %81 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%234 = chlo.broadcast_multiply %222, %233 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%235 = chlo.broadcast_multiply %225, %233 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%236 = chlo.broadcast_subtract %82, %235 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%237 = chlo.broadcast_add %234, %236 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%238 = "mhlo.einsum"(%237, %72) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%239 = chlo.broadcast_add %238, %71 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%240 = chlo.broadcast_power %239, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%241 = chlo.broadcast_multiply %240, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%242 = chlo.broadcast_add %239, %241 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%243 = chlo.broadcast_multiply %242, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%244 = "mhlo.tanh"(%243) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%245 = chlo.broadcast_add %244, %23 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%246 = chlo.broadcast_multiply %239, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%247 = chlo.broadcast_multiply %246, %245 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%248 = "mhlo.einsum"(%247, %74) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%249 = chlo.broadcast_add %248, %73 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%250 = chlo.broadcast_multiply %249, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%251 = chlo.broadcast_multiply %250, %42 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%252 = chlo.broadcast_add %251, %237 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%253 = "mhlo.reduce"(%252, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%254 = chlo.broadcast_divide %253, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%255 = "mhlo.reshape"(%254) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%256 = chlo.broadcast_subtract %252, %255 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%257 = chlo.broadcast_multiply %256, %256 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%258 = "mhlo.reduce"(%257, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%259 = chlo.broadcast_divide %258, %21 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%260 = "mhlo.reshape"(%259) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%261 = chlo.broadcast_add %260, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%262 = "mhlo.rsqrt"(%261) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%263 = chlo.broadcast_multiply %262, %75 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%264 = chlo.broadcast_multiply %252, %263 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%265 = chlo.broadcast_multiply %255, %263 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%266 = chlo.broadcast_subtract %76, %265 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%267 = chlo.broadcast_add %264, %266 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%268 = "mhlo.slice"(%267) {limit_indices = dense<[1, 1, 768]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x512x768xf32>) -> tensor<1x1x768xf32>
%269 = "mhlo.reshape"(%268) : (tensor<1x1x768xf32>) -> tensor<1x768xf32>
%270 = "mhlo.dot"(%269, %52) : (tensor<1x768xf32>, tensor<768x768xf32>) -> tensor<1x768xf32>
%271 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<768xf32>) -> tensor<1x768xf32>
%272 = mhlo.add %270, %271 : tensor<1x768xf32>
%273 = "mhlo.tanh"(%272) : (tensor<1x768xf32>) -> tensor<1x768xf32>
%274 = chlo.broadcast_multiply %273, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x768xf32>, tensor<f32>) -> tensor<1x768xf32>
%275 = chlo.broadcast_multiply %274, %48 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%276 = "mhlo.dot"(%275, %89) : (tensor<1x768xf32>, tensor<768x5xf32>) -> tensor<1x5xf32>
%277 = "mhlo.broadcast_in_dim"(%88) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<1x5xf32>
%278 = mhlo.add %276, %277 : tensor<1x5xf32>
%279 = chlo.broadcast_power %239, %90 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%280 = chlo.broadcast_power %170, %90 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%281 = "mhlo.convert"(%arg3) : (tensor<1xi32>) -> tensor<1xf32>
%282 = "mhlo.convert"(%281) : (tensor<1xf32>) -> tensor<1xi64>
%283 = "tf.ReadVariableOp"(%arg45) : (tensor<!tf.resource<tensor<f32>>>) -> tensor<f32>
%284 = chlo.broadcast_subtract %23, %2 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%285 = chlo.broadcast_compare %278, %284 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "LE"} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xi1>
%286 = chlo.broadcast_minimum %278, %284 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xf32>
%287 = chlo.broadcast_compare %286, %2 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GE"} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xi1>
%288 = chlo.broadcast_maximum %286, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xf32>
%289 = "mhlo.log"(%288) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%290 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<5xi64>
%291 = "mhlo.broadcast_in_dim"(%290) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xi64>) -> tensor<1x5xi64>
%292 = "mhlo.broadcast_in_dim"(%282) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xi64>) -> tensor<1x5xi64>
%293 = "mhlo.compare"(%292, %291) {comparison_direction = "EQ"} : (tensor<1x5xi64>, tensor<1x5xi64>) -> tensor<1x5xi1>
%294 = "mhlo.broadcast"(%23) {broadcast_sizes = dense<[1, 5]> : tensor<2xi64>} : (tensor<f32>) -> tensor<1x5xf32>
%295 = "mhlo.broadcast"(%24) {broadcast_sizes = dense<[1, 5]> : tensor<2xi64>} : (tensor<f32>) -> tensor<1x5xf32>
%296 = "mhlo.select"(%293, %294, %295) : (tensor<1x5xi1>, tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%297 = chlo.broadcast_compare %14, %282 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "LE"} : (tensor<i64>, tensor<1xi64>) -> tensor<1xi1>
%298 = chlo.broadcast_compare %282, %15 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "LT"} : (tensor<1xi64>, tensor<i64>) -> tensor<1xi1>
%299 = chlo.broadcast_and %297, %298 : (tensor<1xi1>, tensor<1xi1>) -> tensor<1xi1>
%300 = chlo.broadcast_select %299, %24, %16 : (tensor<1xi1>, tensor<f32>, tensor<f32>) -> tensor<1xf32>
%301 = "mhlo.reshape"(%300) : (tensor<1xf32>) -> tensor<1x1xf32>
%302 = chlo.broadcast_add %296, %301 : (tensor<1x5xf32>, tensor<1x1xf32>) -> tensor<1x5xf32>
%303 = "mhlo.reduce"(%289, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%304 = "mhlo.broadcast_in_dim"(%303) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%305 = mhlo.subtract %289, %304 : tensor<1x5xf32>
%306 = "mhlo.exponential"(%305) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%307 = "mhlo.reduce"(%306, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%308 = "mhlo.log"(%307) : (tensor<1xf32>) -> tensor<1xf32>
%309 = "mhlo.broadcast_in_dim"(%308) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%310 = mhlo.subtract %305, %309 : tensor<1x5xf32>
%311 = "mhlo.negate"(%302) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%312 = chlo.broadcast_compare %311, %24 {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "EQ"} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1x5xi1>
%313 = chlo.broadcast_multiply %310, %311 : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%314 = chlo.broadcast_select %312, %24, %313 : (tensor<1x5xi1>, tensor<f32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%315 = "mhlo.reduce"(%314, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%316 = "mhlo.reduce"(%289, %22) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.maximum %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%317 = "mhlo.broadcast_in_dim"(%316) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%318 = mhlo.subtract %289, %317 : tensor<1x5xf32>
%319 = "mhlo.exponential"(%318) : (tensor<1x5xf32>) -> tensor<1x5xf32>
%320 = "mhlo.reduce"(%319, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<1xf32>
%321 = "mhlo.broadcast_in_dim"(%320) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x5xf32>
%322 = mhlo.divide %319, %321 : tensor<1x5xf32>
%323 = chlo.broadcast_subtract %322, %302 : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%324 = "mhlo.reduce"(%315, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>, tensor<f32>) -> tensor<f32>
%325 = chlo.broadcast_compare %23, %24 {comparison_direction = "EQ"} : (tensor<f32>, tensor<f32>) -> tensor<i1>
%326 = chlo.broadcast_divide %23, %23 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%327 = chlo.broadcast_select %325, %24, %326 : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
%328 = "mhlo.reshape"(%327) : (tensor<f32>) -> tensor<1x1xf32>
%329 = chlo.broadcast_multiply %328, %323 : (tensor<1x1xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%330 = chlo.broadcast_divide %23, %288 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%331 = chlo.broadcast_multiply %329, %330 : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%332 = chlo.broadcast_select %287, %331, %3 : (tensor<1x5xi1>, tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%333 = chlo.broadcast_select %285, %332, %3 : (tensor<1x5xi1>, tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<1x5xf32>
%334 = "mhlo.reduce"(%333, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<0> : tensor<1xi64>} : (tensor<1x5xf32>, tensor<f32>) -> tensor<5xf32>
%335 = chlo.broadcast_multiply %283, %334 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<5xf32>) -> tensor<5xf32>
%336 = "tf.ReadVariableOp"(%arg44) : (tensor<!tf.resource<tensor<5xf32>>>) -> tensor<5xf32>
%337 = chlo.broadcast_subtract %336, %335 : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32>
"tf.AssignVariableOp"(%arg44, %337) : (tensor<!tf.resource<tensor<5xf32>>>, tensor<5xf32>) -> ()
%338 = "mhlo.transpose"(%89) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<768x5xf32>) -> tensor<5x768xf32>
%339 = "mhlo.dot"(%333, %338) : (tensor<1x5xf32>, tensor<5x768xf32>) -> tensor<1x768xf32>
%340 = chlo.broadcast_multiply %339, %48 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%341 = chlo.broadcast_multiply %340, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x768xf32>, tensor<f32>) -> tensor<1x768xf32>
%342 = chlo.broadcast_multiply %273, %273 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%343 = chlo.broadcast_subtract %23, %342 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%344 = chlo.broadcast_multiply %341, %343 : (tensor<1x768xf32>, tensor<1x768xf32>) -> tensor<1x768xf32>
%345 = "mhlo.reduce"(%344, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<0> : tensor<1xi64>} : (tensor<1x768xf32>, tensor<f32>) -> tensor<768xf32>
%346 = chlo.broadcast_multiply %283, %345 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%347 = "tf.ReadVariableOp"(%arg42) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%348 = chlo.broadcast_subtract %347, %346 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg42, %348) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%349 = "mhlo.transpose"(%52) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<768x768xf32>) -> tensor<768x768xf32>
%350 = "mhlo.dot"(%344, %349) : (tensor<1x768xf32>, tensor<768x768xf32>) -> tensor<1x768xf32>
%351 = "mhlo.reshape"(%350) : (tensor<1x768xf32>) -> tensor<1x1x768xf32>
%352 = "mhlo.pad"(%351, %24) {edge_padding_high = dense<[0, 511, 0]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x1x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%353 = chlo.broadcast_multiply %352, %263 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%354 = chlo.broadcast_multiply %352, %252 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%355 = "mhlo.negate"(%352) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%356 = chlo.broadcast_multiply %355, %263 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%357 = "mhlo.reduce"(%356, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%358 = "mhlo.reshape"(%357) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%359 = "mhlo.broadcast_in_dim"(%358) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%360 = chlo.broadcast_divide %23, %21 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%361 = chlo.broadcast_multiply %359, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%362 = chlo.broadcast_multiply %255, %355 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%363 = chlo.broadcast_add %354, %362 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%364 = chlo.broadcast_multiply %363, %75 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%365 = "mhlo.reduce"(%364, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%366 = "mhlo.reshape"(%365) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%367 = chlo.broadcast_multiply %262, %262 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%368 = chlo.broadcast_multiply %367, %262 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%369 = chlo.broadcast_divide %366, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%370 = chlo.broadcast_multiply %368, %369 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%371 = "mhlo.broadcast_in_dim"(%370) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%372 = chlo.broadcast_multiply %371, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%373 = chlo.broadcast_multiply %372, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%374 = chlo.broadcast_subtract %252, %255 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%375 = chlo.broadcast_multiply %373, %374 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%376 = chlo.broadcast_add %353, %375 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%377 = chlo.broadcast_add %376, %361 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%378 = chlo.broadcast_multiply %377, %42 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%379 = chlo.broadcast_multiply %378, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%380 = "mhlo.reduce"(%379, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%381 = chlo.broadcast_multiply %283, %380 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%382 = "tf.ReadVariableOp"(%arg38) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%383 = chlo.broadcast_subtract %382, %381 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg38, %383) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%384 = "mhlo.einsum"(%379, %74) {einsum_config = "abd,cd->abc"} : (tensor<1x512x768xf32>, tensor<3072x768xf32>) -> tensor<1x512x3072xf32>
%385 = chlo.broadcast_multiply %384, %245 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%386 = chlo.broadcast_multiply %385, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%387 = chlo.broadcast_multiply %384, %246 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%388 = chlo.broadcast_multiply %244, %244 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%389 = chlo.broadcast_subtract %23, %388 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%390 = chlo.broadcast_multiply %387, %389 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%391 = chlo.broadcast_multiply %390, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%392 = chlo.broadcast_multiply %391, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%393 = chlo.broadcast_multiply %392, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%394 = chlo.broadcast_multiply %393, %279 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%395 = chlo.broadcast_add %386, %391 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%396 = chlo.broadcast_add %395, %394 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%397 = "mhlo.reduce"(%396, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<3072xf32>
%398 = chlo.broadcast_multiply %283, %397 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072xf32>) -> tensor<3072xf32>
%399 = "tf.ReadVariableOp"(%arg36) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%400 = chlo.broadcast_subtract %399, %398 : (tensor<3072xf32>, tensor<3072xf32>) -> tensor<3072xf32>
"tf.AssignVariableOp"(%arg36, %400) : (tensor<!tf.resource<tensor<3072xf32>>>, tensor<3072xf32>) -> ()
%401 = "mhlo.einsum"(%396, %72) {einsum_config = "abd,cd->abc"} : (tensor<1x512x3072xf32>, tensor<768x3072xf32>) -> tensor<1x512x768xf32>
%402 = chlo.broadcast_add %377, %401 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%403 = chlo.broadcast_multiply %402, %233 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%404 = chlo.broadcast_multiply %402, %222 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%405 = "mhlo.negate"(%402) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%406 = chlo.broadcast_multiply %405, %233 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%407 = "mhlo.reduce"(%406, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%408 = "mhlo.reshape"(%407) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%409 = "mhlo.broadcast_in_dim"(%408) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%410 = chlo.broadcast_multiply %409, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%411 = chlo.broadcast_multiply %225, %405 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%412 = chlo.broadcast_add %404, %411 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%413 = chlo.broadcast_multiply %412, %81 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%414 = "mhlo.reduce"(%413, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%415 = "mhlo.reshape"(%414) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%416 = chlo.broadcast_multiply %232, %232 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%417 = chlo.broadcast_multiply %416, %232 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%418 = chlo.broadcast_divide %415, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%419 = chlo.broadcast_multiply %417, %418 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%420 = "mhlo.broadcast_in_dim"(%419) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%421 = chlo.broadcast_multiply %420, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%422 = chlo.broadcast_multiply %421, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%423 = chlo.broadcast_subtract %222, %225 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%424 = chlo.broadcast_multiply %422, %423 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%425 = chlo.broadcast_add %403, %424 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%426 = chlo.broadcast_add %425, %410 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%427 = chlo.broadcast_multiply %426, %39 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%428 = chlo.broadcast_multiply %427, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%429 = "mhlo.reduce"(%428, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%430 = chlo.broadcast_multiply %283, %429 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%431 = "tf.ReadVariableOp"(%arg32) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%432 = chlo.broadcast_subtract %431, %430 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg32, %432) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%433 = "mhlo.einsum"(%428, %78) {einsum_config = "abe,cde->abcd"} : (tensor<1x512x768xf32>, tensor<12x64x768xf32>) -> tensor<1x512x12x64xf32>
%434 = "mhlo.einsum"(%433, %216) {einsum_config = "abcd,aecd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%435 = chlo.broadcast_multiply %434, %45 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%436 = chlo.broadcast_multiply %435, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%437 = chlo.broadcast_multiply %436, %212 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%438 = "mhlo.reduce"(%437, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%439 = "mhlo.reshape"(%438) : (tensor<1x12x512xf32>) -> tensor<1x12x512x1xf32>
%440 = chlo.broadcast_subtract %436, %439 : (tensor<1x12x512x512xf32>, tensor<1x12x512x1xf32>) -> tensor<1x12x512x512xf32>
%441 = chlo.broadcast_multiply %440, %212 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%442 = "mhlo.einsum"(%441, %203) {einsum_config = "acbe,abcd->aecd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%443 = "mhlo.reduce"(%442, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%444 = chlo.broadcast_multiply %283, %443 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%445 = "tf.ReadVariableOp"(%arg28) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%446 = chlo.broadcast_subtract %445, %444 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg28, %446) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%447 = "mhlo.einsum"(%442, %80) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%448 = "mhlo.einsum"(%442, %198) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%449 = chlo.broadcast_multiply %283, %448 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%450 = "tf.ReadVariableOp"(%arg27) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%451 = chlo.broadcast_subtract %450, %449 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg27, %451) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%452 = "mhlo.einsum"(%441, %200) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%453 = chlo.broadcast_multiply %452, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%454 = "mhlo.reduce"(%453, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%455 = chlo.broadcast_multiply %283, %454 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%456 = "tf.ReadVariableOp"(%arg26) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%457 = chlo.broadcast_subtract %456, %455 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg26, %457) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%458 = "mhlo.einsum"(%453, %84) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%459 = "mhlo.einsum"(%453, %198) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%460 = chlo.broadcast_multiply %283, %459 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%461 = "tf.ReadVariableOp"(%arg25) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%462 = chlo.broadcast_subtract %461, %460 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg25, %462) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%463 = "mhlo.einsum"(%433, %214) {einsum_config = "abcd,acbe->aecd"} : (tensor<1x512x12x64xf32>, tensor<1x12x512x512xf32>) -> tensor<1x512x12x64xf32>
%464 = "mhlo.reduce"(%463, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%465 = chlo.broadcast_multiply %283, %464 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%466 = "tf.ReadVariableOp"(%arg30) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%467 = chlo.broadcast_subtract %466, %465 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg30, %467) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%468 = "mhlo.einsum"(%463, %86) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%469 = chlo.broadcast_add %426, %468 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%470 = chlo.broadcast_add %447, %458 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%471 = chlo.broadcast_add %469, %470 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%472 = chlo.broadcast_multiply %471, %194 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%473 = chlo.broadcast_multiply %471, %183 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%474 = "mhlo.negate"(%471) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%475 = chlo.broadcast_multiply %474, %194 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%476 = "mhlo.reduce"(%475, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%477 = "mhlo.reshape"(%476) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%478 = "mhlo.broadcast_in_dim"(%477) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%479 = chlo.broadcast_multiply %478, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%480 = chlo.broadcast_multiply %186, %474 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%481 = chlo.broadcast_add %473, %480 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%482 = chlo.broadcast_multiply %481, %59 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%483 = "mhlo.reduce"(%482, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%484 = "mhlo.reshape"(%483) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%485 = chlo.broadcast_multiply %193, %193 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%486 = chlo.broadcast_multiply %485, %193 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%487 = chlo.broadcast_divide %484, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%488 = chlo.broadcast_multiply %486, %487 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%489 = "mhlo.broadcast_in_dim"(%488) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%490 = chlo.broadcast_multiply %489, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%491 = chlo.broadcast_multiply %490, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%492 = chlo.broadcast_subtract %183, %186 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%493 = chlo.broadcast_multiply %491, %492 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%494 = chlo.broadcast_add %472, %493 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%495 = chlo.broadcast_add %494, %479 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%496 = chlo.broadcast_multiply %495, %33 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%497 = chlo.broadcast_multiply %496, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%498 = "mhlo.reduce"(%497, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%499 = chlo.broadcast_multiply %283, %498 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%500 = "tf.ReadVariableOp"(%arg22) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%501 = chlo.broadcast_subtract %500, %499 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg22, %501) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%502 = "mhlo.einsum"(%497, %58) {einsum_config = "abd,cd->abc"} : (tensor<1x512x768xf32>, tensor<3072x768xf32>) -> tensor<1x512x3072xf32>
%503 = chlo.broadcast_multiply %502, %176 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%504 = chlo.broadcast_multiply %503, %7 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%505 = chlo.broadcast_multiply %502, %177 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%506 = chlo.broadcast_multiply %175, %175 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%507 = chlo.broadcast_subtract %23, %506 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%508 = chlo.broadcast_multiply %505, %507 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%509 = chlo.broadcast_multiply %508, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%510 = chlo.broadcast_multiply %509, %9 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%511 = chlo.broadcast_multiply %510, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%512 = chlo.broadcast_multiply %511, %280 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%513 = chlo.broadcast_add %504, %509 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%514 = chlo.broadcast_add %513, %512 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%515 = "mhlo.reduce"(%514, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<3072xf32>
%516 = chlo.broadcast_multiply %283, %515 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072xf32>) -> tensor<3072xf32>
%517 = "tf.ReadVariableOp"(%arg20) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%518 = chlo.broadcast_subtract %517, %516 : (tensor<3072xf32>, tensor<3072xf32>) -> tensor<3072xf32>
"tf.AssignVariableOp"(%arg20, %518) : (tensor<!tf.resource<tensor<3072xf32>>>, tensor<3072xf32>) -> ()
%519 = "mhlo.einsum"(%514, %56) {einsum_config = "abd,cd->abc"} : (tensor<1x512x3072xf32>, tensor<768x3072xf32>) -> tensor<1x512x768xf32>
%520 = chlo.broadcast_add %495, %519 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%521 = chlo.broadcast_multiply %520, %164 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%522 = chlo.broadcast_multiply %520, %153 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%523 = "mhlo.negate"(%520) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%524 = chlo.broadcast_multiply %523, %164 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%525 = "mhlo.reduce"(%524, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%526 = "mhlo.reshape"(%525) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%527 = "mhlo.broadcast_in_dim"(%526) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%528 = chlo.broadcast_multiply %527, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%529 = chlo.broadcast_multiply %156, %523 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%530 = chlo.broadcast_add %522, %529 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%531 = chlo.broadcast_multiply %530, %65 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%532 = "mhlo.reduce"(%531, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%533 = "mhlo.reshape"(%532) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%534 = chlo.broadcast_multiply %163, %163 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%535 = chlo.broadcast_multiply %534, %163 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%536 = chlo.broadcast_divide %533, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%537 = chlo.broadcast_multiply %535, %536 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%538 = "mhlo.broadcast_in_dim"(%537) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%539 = chlo.broadcast_multiply %538, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%540 = chlo.broadcast_multiply %539, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%541 = chlo.broadcast_subtract %153, %156 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%542 = chlo.broadcast_multiply %540, %541 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%543 = chlo.broadcast_add %521, %542 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%544 = chlo.broadcast_add %543, %528 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%545 = chlo.broadcast_multiply %544, %30 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%546 = chlo.broadcast_multiply %545, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%547 = "mhlo.reduce"(%546, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%548 = chlo.broadcast_multiply %283, %547 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%549 = "tf.ReadVariableOp"(%arg16) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%550 = chlo.broadcast_subtract %549, %548 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg16, %550) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%551 = "mhlo.einsum"(%546, %62) {einsum_config = "abe,cde->abcd"} : (tensor<1x512x768xf32>, tensor<12x64x768xf32>) -> tensor<1x512x12x64xf32>
%552 = "mhlo.einsum"(%551, %147) {einsum_config = "abcd,aecd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%553 = chlo.broadcast_multiply %552, %36 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%554 = chlo.broadcast_multiply %553, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512x512xf32>
%555 = chlo.broadcast_multiply %554, %143 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%556 = "mhlo.reduce"(%555, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%557 = "mhlo.reshape"(%556) : (tensor<1x12x512xf32>) -> tensor<1x12x512x1xf32>
%558 = chlo.broadcast_subtract %554, %557 : (tensor<1x12x512x512xf32>, tensor<1x12x512x1xf32>) -> tensor<1x12x512x512xf32>
%559 = chlo.broadcast_multiply %558, %143 : (tensor<1x12x512x512xf32>, tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%560 = "mhlo.einsum"(%559, %134) {einsum_config = "acbe,abcd->aecd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%561 = "mhlo.reduce"(%560, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%562 = chlo.broadcast_multiply %283, %561 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%563 = "tf.ReadVariableOp"(%arg12) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%564 = chlo.broadcast_subtract %563, %562 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg12, %564) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%565 = "mhlo.einsum"(%560, %64) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%566 = "mhlo.einsum"(%560, %129) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%567 = chlo.broadcast_multiply %283, %566 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%568 = "tf.ReadVariableOp"(%arg11) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%569 = chlo.broadcast_subtract %568, %567 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg11, %569) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%570 = "mhlo.einsum"(%559, %131) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%571 = chlo.broadcast_multiply %570, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%572 = "mhlo.reduce"(%571, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%573 = chlo.broadcast_multiply %283, %572 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%574 = "tf.ReadVariableOp"(%arg10) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%575 = chlo.broadcast_subtract %574, %573 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg10, %575) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%576 = "mhlo.einsum"(%571, %68) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%577 = "mhlo.einsum"(%571, %129) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%578 = chlo.broadcast_multiply %283, %577 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%579 = "tf.ReadVariableOp"(%arg9) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%580 = chlo.broadcast_subtract %579, %578 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg9, %580) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%581 = "mhlo.einsum"(%551, %145) {einsum_config = "abcd,acbe->aecd"} : (tensor<1x512x12x64xf32>, tensor<1x12x512x512xf32>) -> tensor<1x512x12x64xf32>
%582 = "mhlo.reduce"(%581, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<12x64xf32>
%583 = chlo.broadcast_multiply %283, %582 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64xf32>) -> tensor<12x64xf32>
%584 = "tf.ReadVariableOp"(%arg14) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%585 = chlo.broadcast_subtract %584, %583 : (tensor<12x64xf32>, tensor<12x64xf32>) -> tensor<12x64xf32>
"tf.AssignVariableOp"(%arg14, %585) : (tensor<!tf.resource<tensor<12x64xf32>>>, tensor<12x64xf32>) -> ()
%586 = "mhlo.einsum"(%581, %70) {einsum_config = "abde,cde->abc"} : (tensor<1x512x12x64xf32>, tensor<768x12x64xf32>) -> tensor<1x512x768xf32>
%587 = chlo.broadcast_add %544, %586 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%588 = chlo.broadcast_add %565, %576 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%589 = chlo.broadcast_add %587, %588 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%590 = chlo.broadcast_multiply %589, %27 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%591 = chlo.broadcast_multiply %590, %13 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%592 = chlo.broadcast_multiply %591, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%593 = chlo.broadcast_multiply %591, %112 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%594 = "mhlo.negate"(%591) : (tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%595 = chlo.broadcast_multiply %594, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%596 = "mhlo.reduce"(%595, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%597 = "mhlo.reshape"(%596) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%598 = "mhlo.broadcast_in_dim"(%597) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%599 = chlo.broadcast_multiply %598, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%600 = chlo.broadcast_multiply %115, %594 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%601 = chlo.broadcast_add %593, %600 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%602 = chlo.broadcast_multiply %601, %49 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%603 = "mhlo.reduce"(%602, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<512xf32>
%604 = "mhlo.reshape"(%603) : (tensor<512xf32>) -> tensor<1x512x1xf32>
%605 = chlo.broadcast_multiply %122, %122 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%606 = chlo.broadcast_multiply %605, %122 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%607 = chlo.broadcast_divide %604, %17 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%608 = chlo.broadcast_multiply %606, %607 : (tensor<1x512x1xf32>, tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%609 = "mhlo.broadcast_in_dim"(%608) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%610 = chlo.broadcast_multiply %609, %360 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%611 = chlo.broadcast_multiply %610, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512x768xf32>
%612 = chlo.broadcast_subtract %112, %115 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%613 = chlo.broadcast_multiply %611, %612 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%614 = chlo.broadcast_add %592, %613 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%615 = chlo.broadcast_add %614, %599 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%616 = "mhlo.reshape"(%615) : (tensor<1x512x768xf32>) -> tensor<512x768xf32>
%617 = chlo.broadcast_multiply %283, %616 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<512x768xf32>) -> tensor<512x768xf32>
%618 = "tf.ReadVariableOp"(%arg5) : (tensor<!tf.resource<tensor<512x768xf32>>>) -> tensor<512x768xf32>
%619 = chlo.broadcast_subtract %618, %617 : (tensor<512x768xf32>, tensor<512x768xf32>) -> tensor<512x768xf32>
"tf.AssignVariableOp"(%arg5, %619) : (tensor<!tf.resource<tensor<512x768xf32>>>, tensor<512x768xf32>) -> ()
%620 = "mhlo.reshape"(%615) : (tensor<1x512x768xf32>) -> tensor<512x768xf32>
%621 = "mhlo.transpose"(%109) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<512x16xf32>) -> tensor<16x512xf32>
%622 = "mhlo.dot"(%621, %620) : (tensor<16x512xf32>, tensor<512x768xf32>) -> tensor<16x768xf32>
%623 = chlo.broadcast_multiply %283, %622 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<16x768xf32>) -> tensor<16x768xf32>
%624 = "tf.ReadVariableOp"(%arg6) : (tensor<!tf.resource<tensor<16x768xf32>>>) -> tensor<16x768xf32>
%625 = chlo.broadcast_subtract %624, %623 : (tensor<16x768xf32>, tensor<16x768xf32>) -> tensor<16x768xf32>
"tf.AssignVariableOp"(%arg6, %625) : (tensor<!tf.resource<tensor<16x768xf32>>>, tensor<16x768xf32>) -> ()
%626 = "mhlo.negate"(%620) : (tensor<512x768xf32>) -> tensor<512x768xf32>
%627 = chlo.broadcast_multiply %626, %283 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<512x768xf32>, tensor<f32>) -> tensor<512x768xf32>
"tf.ResourceScatterAdd"(%arg4, %91, %627) {_class = ["loc:@bert_classifier/bert_encoder_1/word_embeddings/Gather/resource"], device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<!tf.resource<tensor<100x768xf32>>>, tensor<512xi32>, tensor<512x768xf32>) -> ()
%628 = chlo.broadcast_multiply %122, %601 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%629 = "mhlo.reduce"(%628, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%630 = chlo.broadcast_multiply %283, %629 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%631 = "tf.ReadVariableOp"(%arg7) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%632 = chlo.broadcast_subtract %631, %630 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg7, %632) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%633 = "mhlo.reduce"(%591, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%634 = chlo.broadcast_multiply %283, %633 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%635 = "tf.ReadVariableOp"(%arg8) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%636 = chlo.broadcast_subtract %635, %634 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg8, %636) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%637 = "mhlo.einsum"(%581, %129) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%638 = chlo.broadcast_multiply %283, %637 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%639 = "tf.ReadVariableOp"(%arg13) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%640 = chlo.broadcast_subtract %639, %638 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg13, %640) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%641 = "mhlo.einsum"(%546, %148) {einsum_config = "abe,abcd->cde"} : (tensor<1x512x768xf32>, tensor<1x512x12x64xf32>) -> tensor<12x64x768xf32>
%642 = chlo.broadcast_multiply %283, %641 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
%643 = "tf.ReadVariableOp"(%arg15) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%644 = chlo.broadcast_subtract %643, %642 : (tensor<12x64x768xf32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
"tf.AssignVariableOp"(%arg15, %644) : (tensor<!tf.resource<tensor<12x64x768xf32>>>, tensor<12x64x768xf32>) -> ()
%645 = chlo.broadcast_multiply %163, %530 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%646 = "mhlo.reduce"(%645, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%647 = chlo.broadcast_multiply %283, %646 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%648 = "tf.ReadVariableOp"(%arg17) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%649 = chlo.broadcast_subtract %648, %647 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg17, %649) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%650 = "mhlo.reduce"(%520, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%651 = chlo.broadcast_multiply %283, %650 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%652 = "tf.ReadVariableOp"(%arg18) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%653 = chlo.broadcast_subtract %652, %651 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg18, %653) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%654 = "mhlo.einsum"(%514, %168) {einsum_config = "abd,abc->cd"} : (tensor<1x512x3072xf32>, tensor<1x512x768xf32>) -> tensor<768x3072xf32>
%655 = chlo.broadcast_multiply %283, %654 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
%656 = "tf.ReadVariableOp"(%arg19) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%657 = chlo.broadcast_subtract %656, %655 : (tensor<768x3072xf32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
"tf.AssignVariableOp"(%arg19, %657) : (tensor<!tf.resource<tensor<768x3072xf32>>>, tensor<768x3072xf32>) -> ()
%658 = "mhlo.einsum"(%497, %178) {einsum_config = "abd,abc->cd"} : (tensor<1x512x768xf32>, tensor<1x512x3072xf32>) -> tensor<3072x768xf32>
%659 = chlo.broadcast_multiply %283, %658 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
%660 = "tf.ReadVariableOp"(%arg21) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%661 = chlo.broadcast_subtract %660, %659 : (tensor<3072x768xf32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
"tf.AssignVariableOp"(%arg21, %661) : (tensor<!tf.resource<tensor<3072x768xf32>>>, tensor<3072x768xf32>) -> ()
%662 = chlo.broadcast_multiply %193, %481 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%663 = "mhlo.reduce"(%662, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%664 = chlo.broadcast_multiply %283, %663 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%665 = "tf.ReadVariableOp"(%arg23) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%666 = chlo.broadcast_subtract %665, %664 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg23, %666) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%667 = "mhlo.reduce"(%471, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%668 = chlo.broadcast_multiply %283, %667 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%669 = "tf.ReadVariableOp"(%arg24) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%670 = chlo.broadcast_subtract %669, %668 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg24, %670) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%671 = "mhlo.einsum"(%463, %198) {einsum_config = "abde,abc->cde"} : (tensor<1x512x12x64xf32>, tensor<1x512x768xf32>) -> tensor<768x12x64xf32>
%672 = chlo.broadcast_multiply %283, %671 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
%673 = "tf.ReadVariableOp"(%arg29) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%674 = chlo.broadcast_subtract %673, %672 : (tensor<768x12x64xf32>, tensor<768x12x64xf32>) -> tensor<768x12x64xf32>
"tf.AssignVariableOp"(%arg29, %674) : (tensor<!tf.resource<tensor<768x12x64xf32>>>, tensor<768x12x64xf32>) -> ()
%675 = "mhlo.einsum"(%428, %217) {einsum_config = "abe,abcd->cde"} : (tensor<1x512x768xf32>, tensor<1x512x12x64xf32>) -> tensor<12x64x768xf32>
%676 = chlo.broadcast_multiply %283, %675 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
%677 = "tf.ReadVariableOp"(%arg31) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%678 = chlo.broadcast_subtract %677, %676 : (tensor<12x64x768xf32>, tensor<12x64x768xf32>) -> tensor<12x64x768xf32>
"tf.AssignVariableOp"(%arg31, %678) : (tensor<!tf.resource<tensor<12x64x768xf32>>>, tensor<12x64x768xf32>) -> ()
%679 = chlo.broadcast_multiply %232, %412 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%680 = "mhlo.reduce"(%679, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%681 = chlo.broadcast_multiply %283, %680 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%682 = "tf.ReadVariableOp"(%arg33) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%683 = chlo.broadcast_subtract %682, %681 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg33, %683) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%684 = "mhlo.reduce"(%402, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%685 = chlo.broadcast_multiply %283, %684 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%686 = "tf.ReadVariableOp"(%arg34) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%687 = chlo.broadcast_subtract %686, %685 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg34, %687) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%688 = "mhlo.einsum"(%396, %237) {einsum_config = "abd,abc->cd"} : (tensor<1x512x3072xf32>, tensor<1x512x768xf32>) -> tensor<768x3072xf32>
%689 = chlo.broadcast_multiply %283, %688 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
%690 = "tf.ReadVariableOp"(%arg35) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%691 = chlo.broadcast_subtract %690, %689 : (tensor<768x3072xf32>, tensor<768x3072xf32>) -> tensor<768x3072xf32>
"tf.AssignVariableOp"(%arg35, %691) : (tensor<!tf.resource<tensor<768x3072xf32>>>, tensor<768x3072xf32>) -> ()
%692 = "mhlo.einsum"(%379, %247) {einsum_config = "abd,abc->cd"} : (tensor<1x512x768xf32>, tensor<1x512x3072xf32>) -> tensor<3072x768xf32>
%693 = chlo.broadcast_multiply %283, %692 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
%694 = "tf.ReadVariableOp"(%arg37) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%695 = chlo.broadcast_subtract %694, %693 : (tensor<3072x768xf32>, tensor<3072x768xf32>) -> tensor<3072x768xf32>
"tf.AssignVariableOp"(%arg37, %695) : (tensor<!tf.resource<tensor<3072x768xf32>>>, tensor<3072x768xf32>) -> ()
%696 = chlo.broadcast_multiply %262, %363 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%697 = "mhlo.reduce"(%696, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%698 = chlo.broadcast_multiply %283, %697 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%699 = "tf.ReadVariableOp"(%arg39) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%700 = chlo.broadcast_subtract %699, %698 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg39, %700) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%701 = "mhlo.reduce"(%352, %24) ( {
^bb0(%arg47: tensor<f32>, %arg48: tensor<f32>): // no predecessors
%720 = mhlo.add %arg47, %arg48 : tensor<f32>
"mhlo.return"(%720) : (tensor<f32>) -> ()
}) {dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<768xf32>
%702 = chlo.broadcast_multiply %283, %701 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768xf32>) -> tensor<768xf32>
%703 = "tf.ReadVariableOp"(%arg40) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%704 = chlo.broadcast_subtract %703, %702 : (tensor<768xf32>, tensor<768xf32>) -> tensor<768xf32>
"tf.AssignVariableOp"(%arg40, %704) : (tensor<!tf.resource<tensor<768xf32>>>, tensor<768xf32>) -> ()
%705 = "mhlo.transpose"(%269) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x768xf32>) -> tensor<768x1xf32>
%706 = "mhlo.dot"(%705, %344) : (tensor<768x1xf32>, tensor<1x768xf32>) -> tensor<768x768xf32>
%707 = chlo.broadcast_multiply %283, %706 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x768xf32>) -> tensor<768x768xf32>
%708 = "tf.ReadVariableOp"(%arg41) : (tensor<!tf.resource<tensor<768x768xf32>>>) -> tensor<768x768xf32>
%709 = chlo.broadcast_subtract %708, %707 : (tensor<768x768xf32>, tensor<768x768xf32>) -> tensor<768x768xf32>
"tf.AssignVariableOp"(%arg41, %709) : (tensor<!tf.resource<tensor<768x768xf32>>>, tensor<768x768xf32>) -> ()
%710 = "mhlo.transpose"(%275) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x768xf32>) -> tensor<768x1xf32>
%711 = "mhlo.dot"(%710, %333) : (tensor<768x1xf32>, tensor<1x5xf32>) -> tensor<768x5xf32>
%712 = chlo.broadcast_multiply %283, %711 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<768x5xf32>) -> tensor<768x5xf32>
%713 = "tf.ReadVariableOp"(%arg43) : (tensor<!tf.resource<tensor<768x5xf32>>>) -> tensor<768x5xf32>
%714 = chlo.broadcast_subtract %713, %712 : (tensor<768x5xf32>, tensor<768x5xf32>) -> tensor<768x5xf32>
"tf.AssignVariableOp"(%arg43, %714) : (tensor<!tf.resource<tensor<768x5xf32>>>, tensor<768x5xf32>) -> ()
%715 = "tf.ReadVariableOp"(%arg46) : (tensor<!tf.resource<tensor<i64>>>) -> tensor<i64>
%716 = chlo.broadcast_add %715, %0 : (tensor<i64>, tensor<i64>) -> tensor<i64>
"tf.AssignVariableOp"(%arg46, %716) : (tensor<!tf.resource<tensor<i64>>>, tensor<i64>) -> ()
%717 = chlo.broadcast_compare %23, %24 {comparison_direction = "EQ"} : (tensor<f32>, tensor<f32>) -> tensor<i1>
%718 = chlo.broadcast_divide %324, %23 : (tensor<f32>, tensor<f32>) -> tensor<f32>
%719 = chlo.broadcast_select %717, %24, %718 : (tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<f32>
return %719 : tensor<f32>
}
// Exported "predict" entry point of a BERT-style text classifier, dumped as
// MLIR (mhlo/chlo dialects) before the LowerGlobalTensors pass.
// Inputs: %arg0 token ids, %arg1 attention mask, %arg2 token-type ids (all
// 1x512 i32); %arg3..%arg43 are resource handles to the model weights.
// Output: 1x5 classification logits.
func @"__inference_<lambda>_32400"(%arg0: tensor<1x512xi32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0, 0]}, %arg1: tensor<1x512xi32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0, 1]}, %arg2: tensor<1x512xi32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0, 2]}, %arg3: tensor<!tf.resource<tensor<100x768xf32>>> {tf_saved_model.bound_input = @"__sm_node45__m.layer-3.layer-1.embeddings"}, %arg4: tensor<!tf.resource<tensor<512x768xf32>>> {tf_saved_model.bound_input = @"__sm_node46__m.layer-3.layer-3.embeddings"}, %arg5: tensor<!tf.resource<tensor<16x768xf32>>> {tf_saved_model.bound_input = @"__sm_node47__m.layer-3.layer-4.embeddings"}, %arg6: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node48__m.layer-3.layer-6.gamma"}, %arg7: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node49__m.layer-3.layer-6.beta"}, %arg8: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node50__m.layer-3.layer-10._attention_layer._query_dense.kernel"}, %arg9: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node51__m.layer-3.layer-10._attention_layer._query_dense.bias"}, %arg10: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node52__m.layer-3.layer-10._attention_layer._key_dense.kernel"}, %arg11: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node53__m.layer-3.layer-10._attention_layer._key_dense.bias"}, %arg12: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node54__m.layer-3.layer-10._attention_layer._value_dense.kernel"}, %arg13: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node55__m.layer-3.layer-10._attention_layer._value_dense.bias"}, %arg14: tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node56__m.layer-3.layer-10._attention_layer._output_dense.kernel"}, 
%arg15: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node57__m.layer-3.layer-10._attention_layer._output_dense.bias"}, %arg16: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node58__m.layer-3.layer-10.keras_api.layers.2.gamma"}, %arg17: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node59__m.layer-3.layer-10.keras_api.layers.2.beta"}, %arg18: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node60__m.layer-3.layer-10.keras_api.layers.3.kernel"}, %arg19: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node61__m.layer-3.layer-10.keras_api.layers.3.bias"}, %arg20: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node62__m.layer-3.layer-10._output_dense.kernel"}, %arg21: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node63__m.layer-3.layer-10._output_dense.bias"}, %arg22: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node64__m.layer-3.layer-10._output_layer_norm.gamma"}, %arg23: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node65__m.layer-3.layer-10._output_layer_norm.beta"}, %arg24: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node66__m.layer-3.layer-11._attention_layer._query_dense.kernel"}, %arg25: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node67__m.layer-3.layer-11._attention_layer._query_dense.bias"}, %arg26: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node68__m.layer-3.layer-11._attention_layer._key_dense.kernel"}, %arg27: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node69__m.layer-3.layer-11._attention_layer._key_dense.bias"}, %arg28: tensor<!tf.resource<tensor<768x12x64xf32>>> {tf_saved_model.bound_input = 
@"__sm_node70__m.layer-3.layer-11._attention_layer._value_dense.kernel"}, %arg29: tensor<!tf.resource<tensor<12x64xf32>>> {tf_saved_model.bound_input = @"__sm_node71__m.layer-3.layer-11._attention_layer._value_dense.bias"}, %arg30: tensor<!tf.resource<tensor<12x64x768xf32>>> {tf_saved_model.bound_input = @"__sm_node72__m.layer-3.layer-11._attention_layer._output_dense.kernel"}, %arg31: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node73__m.layer-3.layer-11._attention_layer._output_dense.bias"}, %arg32: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node74__m.layer-3.layer-11.keras_api.layers.2.gamma"}, %arg33: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node75__m.layer-3.layer-11.keras_api.layers.2.beta"}, %arg34: tensor<!tf.resource<tensor<768x3072xf32>>> {tf_saved_model.bound_input = @"__sm_node76__m.layer-3.layer-11.keras_api.layers.3.kernel"}, %arg35: tensor<!tf.resource<tensor<3072xf32>>> {tf_saved_model.bound_input = @"__sm_node77__m.layer-3.layer-11.keras_api.layers.3.bias"}, %arg36: tensor<!tf.resource<tensor<3072x768xf32>>> {tf_saved_model.bound_input = @"__sm_node78__m.layer-3.layer-11._output_dense.kernel"}, %arg37: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node79__m.layer-3.layer-11._output_dense.bias"}, %arg38: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node80__m.layer-3.layer-11._output_layer_norm.gamma"}, %arg39: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node81__m.layer-3.layer-11._output_layer_norm.beta"}, %arg40: tensor<!tf.resource<tensor<768x768xf32>>> {tf_saved_model.bound_input = @"__sm_node82__m.layer-3.layer-13.kernel"}, %arg41: tensor<!tf.resource<tensor<768xf32>>> {tf_saved_model.bound_input = @"__sm_node83__m.layer-3.layer-13.bias"}, %arg42: tensor<!tf.resource<tensor<768x5xf32>>> {tf_saved_model.bound_input = @"__sm_node84__m.layer-5.out_proj.kernel"}, 
%arg43: tensor<!tf.resource<tensor<5xf32>>> {tf_saved_model.bound_input = @"__sm_node85__m.layer-5.out_proj.bias"}) -> (tensor<1x5xf32> {tf_saved_model.index_path = []}) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<1x512>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful, tf_saved_model.exported_names = ["predict"]} {
// --- Constants ---
// %0: additive mask fill value; %1 = 0.125 = 1/sqrt(64) attention scale
// (head dim 64); %2 = 0.797884583 ~= sqrt(2/pi) and %6 = 0.044715 — the
// coefficients of the tanh GELU approximation used below; %8: layer-norm
// epsilon; %9 = -inf, identity for the max-reduce; %11 = 768.0, hidden size
// used as the divisor for mean/variance.
%0 = mhlo.constant dense<-1.000000e+09> : tensor<f32>
%1 = mhlo.constant dense<1.250000e-01> : tensor<f32>
%2 = mhlo.constant dense<0.797884583> : tensor<f32>
%3 = mhlo.constant dense<5.000000e-01> : tensor<f32>
%4 = mhlo.constant dense<1.000000e+00> : tensor<f32>
%5 = mhlo.constant dense<3.000000e+00> : tensor<f32>
%6 = mhlo.constant dense<4.471500e-02> : tensor<f32>
%7 = mhlo.constant dense<1.000000e+00> : tensor<1x512x1xf32>
%8 = mhlo.constant dense<9.99999996E-13> : tensor<f32>
%9 = mhlo.constant dense<0xFF800000> : tensor<f32>
%10 = mhlo.constant dense<0.000000e+00> : tensor<f32>
%11 = mhlo.constant dense<7.680000e+02> : tensor<f32>
// --- Weight loads: read every bound variable out of its tf resource ---
%12 = "tf.ReadVariableOp"(%arg6) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%13 = "tf.ReadVariableOp"(%arg7) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%14 = "tf.ReadVariableOp"(%arg41) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%15 = "tf.ReadVariableOp"(%arg40) : (tensor<!tf.resource<tensor<768x768xf32>>>) -> tensor<768x768xf32>
// Position embedding table (512x768), reshaped to broadcast over the batch.
%16 = "tf.ReadVariableOp"(%arg4) : (tensor<!tf.resource<tensor<512x768xf32>>>) -> tensor<512x768xf32>
%17 = "mhlo.reshape"(%16) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%18 = "tf.ReadVariableOp"(%arg19) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%19 = "tf.ReadVariableOp"(%arg18) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%20 = "tf.ReadVariableOp"(%arg21) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%21 = "tf.ReadVariableOp"(%arg20) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%22 = "tf.ReadVariableOp"(%arg22) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%23 = "tf.ReadVariableOp"(%arg23) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%24 = "tf.ReadVariableOp"(%arg15) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%25 = "tf.ReadVariableOp"(%arg14) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%26 = "tf.ReadVariableOp"(%arg11) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%27 = "tf.ReadVariableOp"(%arg10) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%28 = "tf.ReadVariableOp"(%arg16) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%29 = "tf.ReadVariableOp"(%arg17) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%30 = "tf.ReadVariableOp"(%arg9) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%31 = "tf.ReadVariableOp"(%arg8) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%32 = "tf.ReadVariableOp"(%arg13) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%33 = "tf.ReadVariableOp"(%arg12) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%34 = "tf.ReadVariableOp"(%arg35) : (tensor<!tf.resource<tensor<3072xf32>>>) -> tensor<3072xf32>
%35 = "tf.ReadVariableOp"(%arg34) : (tensor<!tf.resource<tensor<768x3072xf32>>>) -> tensor<768x3072xf32>
%36 = "tf.ReadVariableOp"(%arg37) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%37 = "tf.ReadVariableOp"(%arg36) : (tensor<!tf.resource<tensor<3072x768xf32>>>) -> tensor<3072x768xf32>
%38 = "tf.ReadVariableOp"(%arg38) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%39 = "tf.ReadVariableOp"(%arg39) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%40 = "tf.ReadVariableOp"(%arg31) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%41 = "tf.ReadVariableOp"(%arg30) : (tensor<!tf.resource<tensor<12x64x768xf32>>>) -> tensor<12x64x768xf32>
%42 = "tf.ReadVariableOp"(%arg27) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%43 = "tf.ReadVariableOp"(%arg26) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%44 = "tf.ReadVariableOp"(%arg32) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%45 = "tf.ReadVariableOp"(%arg33) : (tensor<!tf.resource<tensor<768xf32>>>) -> tensor<768xf32>
%46 = "tf.ReadVariableOp"(%arg25) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%47 = "tf.ReadVariableOp"(%arg24) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%48 = "tf.ReadVariableOp"(%arg29) : (tensor<!tf.resource<tensor<12x64xf32>>>) -> tensor<12x64xf32>
%49 = "tf.ReadVariableOp"(%arg28) : (tensor<!tf.resource<tensor<768x12x64xf32>>>) -> tensor<768x12x64xf32>
%50 = "tf.ReadVariableOp"(%arg5) : (tensor<!tf.resource<tensor<16x768xf32>>>) -> tensor<16x768xf32>
%51 = "tf.ReadVariableOp"(%arg43) : (tensor<!tf.resource<tensor<5xf32>>>) -> tensor<5xf32>
%52 = "tf.ReadVariableOp"(%arg42) : (tensor<!tf.resource<tensor<768x5xf32>>>) -> tensor<768x5xf32>
// --- Embeddings ---
// Word embeddings: gather rows of the 100x768 table (%arg3) by token id
// (%arg0), then add the position embeddings (%17).
%53 = "mhlo.reshape"(%arg0) : (tensor<1x512xi32>) -> tensor<512xi32>
%54 = "tf.ReadVariableOp"(%arg3) : (tensor<!tf.resource<tensor<100x768xf32>>>) -> tensor<100x768xf32>
%55 = "mhlo.torch_index_select"(%54, %53) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<100x768xf32>, tensor<512xi32>) -> tensor<512x768xf32>
%56 = "mhlo.reshape"(%55) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%57 = chlo.broadcast_add %56, %17 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Attention mask: expand %arg1 to 1x1x512x512, then (1 - mask) * -1e9 so
// masked positions get a large negative additive bias before softmax.
%58 = "mhlo.reshape"(%arg1) : (tensor<1x512xi32>) -> tensor<1x1x512xi32>
%59 = "mhlo.convert"(%58) : (tensor<1x1x512xi32>) -> tensor<1x1x512xf32>
%60 = chlo.broadcast_multiply %59, %7 : (tensor<1x1x512xf32>, tensor<1x512x1xf32>) -> tensor<1x512x512xf32>
%61 = "mhlo.reshape"(%60) : (tensor<1x512x512xf32>) -> tensor<1x1x512x512xf32>
%62 = chlo.broadcast_subtract %4, %61 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>, tensor<1x1x512x512xf32>) -> tensor<1x1x512x512xf32>
%63 = chlo.broadcast_multiply %62, %0 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x1x512x512xf32>, tensor<f32>) -> tensor<1x1x512x512xf32>
// Token-type embeddings: one-hot(%arg2, depth 16) via iota+compare+select,
// matmul against the 16x768 table (%50), then add into the embedding sum.
%64 = "mhlo.reshape"(%arg2) : (tensor<1x512xi32>) -> tensor<512xi32>
%65 = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<16xi32>
%66 = "mhlo.broadcast_in_dim"(%65) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<16xi32>) -> tensor<512x16xi32>
%67 = "mhlo.broadcast_in_dim"(%64) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<512xi32>) -> tensor<512x16xi32>
%68 = "mhlo.compare"(%67, %66) {comparison_direction = "EQ"} : (tensor<512x16xi32>, tensor<512x16xi32>) -> tensor<512x16xi1>
%69 = "mhlo.broadcast"(%4) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%70 = "mhlo.broadcast"(%10) {broadcast_sizes = dense<[512, 16]> : tensor<2xi64>} : (tensor<f32>) -> tensor<512x16xf32>
%71 = "mhlo.select"(%68, %69, %70) : (tensor<512x16xi1>, tensor<512x16xf32>, tensor<512x16xf32>) -> tensor<512x16xf32>
%72 = "mhlo.dot"(%71, %50) : (tensor<512x16xf32>, tensor<16x768xf32>) -> tensor<512x768xf32>
%73 = "mhlo.reshape"(%72) : (tensor<512x768xf32>) -> tensor<1x512x768xf32>
%74 = chlo.broadcast_add %57, %73 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Embedding layer norm over the last dim: mean (%77), variance (%82),
// rsqrt(var + eps), scaled by gamma %12 and shifted by beta %13.
%75 = "mhlo.reduce"(%74, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%76 = chlo.broadcast_divide %75, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%77 = "mhlo.reshape"(%76) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%78 = chlo.broadcast_subtract %74, %77 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%79 = chlo.broadcast_multiply %78, %78 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%80 = "mhlo.reduce"(%79, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%81 = chlo.broadcast_divide %80, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%82 = "mhlo.reshape"(%81) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%83 = chlo.broadcast_add %82, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%84 = "mhlo.rsqrt"(%83) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%85 = chlo.broadcast_multiply %84, %12 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%86 = chlo.broadcast_multiply %74, %85 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%87 = chlo.broadcast_multiply %77, %85 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%88 = chlo.broadcast_subtract %13, %87 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%89 = chlo.broadcast_add %86, %88 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// --- Transformer layer 1: multi-head self-attention (12 heads x 64 dims) ---
// Query (%91) and key (%93) projections; keys scaled by 1/sqrt(64) (%94);
// raw scores via einsum (%95) plus the additive attention mask (%63).
%90 = "mhlo.einsum"(%89, %27) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%91 = chlo.broadcast_add %90, %26 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%92 = "mhlo.einsum"(%89, %31) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%93 = chlo.broadcast_add %92, %30 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%94 = chlo.broadcast_multiply %93, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%95 = "mhlo.einsum"(%91, %94) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%96 = chlo.broadcast_add %95, %63 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
// Numerically stable softmax over the last (key) dim: subtract row max,
// exponentiate, divide by the row sum.
%97 = "mhlo.reduce"(%96, %9) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.maximum %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%98 = "mhlo.broadcast_in_dim"(%97) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%99 = mhlo.subtract %96, %98 : tensor<1x12x512x512xf32>
%100 = "mhlo.exponential"(%99) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%101 = "mhlo.reduce"(%100, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%102 = "mhlo.broadcast_in_dim"(%101) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%103 = mhlo.divide %100, %102 : tensor<1x12x512x512xf32>
// Value projection (%105), attention-weighted context (%106), output dense
// back to 768 (%107+%24), then residual add with the layer input %89.
%104 = "mhlo.einsum"(%89, %33) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%105 = chlo.broadcast_add %104, %32 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%106 = "mhlo.einsum"(%103, %105) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%107 = "mhlo.einsum"(%106, %25) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%108 = chlo.broadcast_add %107, %24 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%109 = chlo.broadcast_add %89, %108 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Post-attention layer norm (same mean/variance/rsqrt pattern as above)
// with gamma %28 / beta %29.
%110 = "mhlo.reduce"(%109, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%111 = chlo.broadcast_divide %110, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%112 = "mhlo.reshape"(%111) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%113 = chlo.broadcast_subtract %109, %112 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%114 = chlo.broadcast_multiply %113, %113 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%115 = "mhlo.reduce"(%114, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%116 = chlo.broadcast_divide %115, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%117 = "mhlo.reshape"(%116) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%118 = chlo.broadcast_add %117, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%119 = "mhlo.rsqrt"(%118) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%120 = chlo.broadcast_multiply %119, %28 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%121 = chlo.broadcast_multiply %109, %120 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%122 = chlo.broadcast_multiply %112, %120 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%123 = chlo.broadcast_subtract %29, %122 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%124 = chlo.broadcast_add %121, %123 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// Feed-forward: 768 -> 3072, tanh-based GELU approximation
// 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))), then 3072 -> 768.
%125 = "mhlo.einsum"(%124, %19) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%126 = chlo.broadcast_add %125, %18 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%127 = chlo.broadcast_power %126, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%128 = chlo.broadcast_multiply %127, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%129 = chlo.broadcast_add %126, %128 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%130 = chlo.broadcast_multiply %129, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%131 = "mhlo.tanh"(%130) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%132 = chlo.broadcast_add %131, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%133 = chlo.broadcast_multiply %126, %3 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%134 = chlo.broadcast_multiply %133, %132 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%135 = "mhlo.einsum"(%134, %21) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%136 = chlo.broadcast_add %135, %20 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
// Residual add, then output layer norm (gamma %22 / beta %23); %152 is the
// final output of transformer layer 1.
%137 = chlo.broadcast_add %136, %124 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%138 = "mhlo.reduce"(%137, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%139 = chlo.broadcast_divide %138, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%140 = "mhlo.reshape"(%139) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%141 = chlo.broadcast_subtract %137, %140 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%142 = chlo.broadcast_multiply %141, %141 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%143 = "mhlo.reduce"(%142, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%144 = chlo.broadcast_divide %143, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%145 = "mhlo.reshape"(%144) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%146 = chlo.broadcast_add %145, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%147 = "mhlo.rsqrt"(%146) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%148 = chlo.broadcast_multiply %147, %22 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%149 = chlo.broadcast_multiply %137, %148 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%150 = chlo.broadcast_multiply %140, %148 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%151 = chlo.broadcast_subtract %23, %150 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%152 = chlo.broadcast_add %149, %151 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// --- Transformer layer 2: identical structure to layer 1, applied to %152
// with the layer-11 weight set (query/key/value, softmax, output dense,
// residual + layer norm, GELU feed-forward, residual + layer norm). ---
%153 = "mhlo.einsum"(%152, %43) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%154 = chlo.broadcast_add %153, %42 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%155 = "mhlo.einsum"(%152, %47) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%156 = chlo.broadcast_add %155, %46 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%157 = chlo.broadcast_multiply %156, %1 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x12x64xf32>, tensor<f32>) -> tensor<1x512x12x64xf32>
%158 = "mhlo.einsum"(%154, %157) {einsum_config = "aecd,abcd->acbe"} : (tensor<1x512x12x64xf32>, tensor<1x512x12x64xf32>) -> tensor<1x12x512x512xf32>
%159 = chlo.broadcast_add %158, %63 : (tensor<1x12x512x512xf32>, tensor<1x1x512x512xf32>) -> tensor<1x12x512x512xf32>
%160 = "mhlo.reduce"(%159, %9) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.maximum %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%161 = "mhlo.broadcast_in_dim"(%160) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%162 = mhlo.subtract %159, %161 : tensor<1x12x512x512xf32>
%163 = "mhlo.exponential"(%162) : (tensor<1x12x512x512xf32>) -> tensor<1x12x512x512xf32>
%164 = "mhlo.reduce"(%163, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<3> : tensor<1xi64>} : (tensor<1x12x512x512xf32>, tensor<f32>) -> tensor<1x12x512xf32>
%165 = "mhlo.broadcast_in_dim"(%164) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x12x512xf32>) -> tensor<1x12x512x512xf32>
%166 = mhlo.divide %163, %165 : tensor<1x12x512x512xf32>
%167 = "mhlo.einsum"(%152, %49) {einsum_config = "abc,cde->abde"} : (tensor<1x512x768xf32>, tensor<768x12x64xf32>) -> tensor<1x512x12x64xf32>
%168 = chlo.broadcast_add %167, %48 {broadcast_dimensions = dense<[2, 3]> : tensor<2xi64>} : (tensor<1x512x12x64xf32>, tensor<12x64xf32>) -> tensor<1x512x12x64xf32>
%169 = "mhlo.einsum"(%166, %168) {einsum_config = "acbe,aecd->abcd"} : (tensor<1x12x512x512xf32>, tensor<1x512x12x64xf32>) -> tensor<1x512x12x64xf32>
%170 = "mhlo.einsum"(%169, %41) {einsum_config = "abcd,cde->abe"} : (tensor<1x512x12x64xf32>, tensor<12x64x768xf32>) -> tensor<1x512x768xf32>
%171 = chlo.broadcast_add %170, %40 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%172 = chlo.broadcast_add %152, %171 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%173 = "mhlo.reduce"(%172, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%174 = chlo.broadcast_divide %173, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%175 = "mhlo.reshape"(%174) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%176 = chlo.broadcast_subtract %172, %175 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%177 = chlo.broadcast_multiply %176, %176 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%178 = "mhlo.reduce"(%177, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%179 = chlo.broadcast_divide %178, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%180 = "mhlo.reshape"(%179) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%181 = chlo.broadcast_add %180, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%182 = "mhlo.rsqrt"(%181) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%183 = chlo.broadcast_multiply %182, %44 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%184 = chlo.broadcast_multiply %172, %183 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%185 = chlo.broadcast_multiply %175, %183 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%186 = chlo.broadcast_subtract %45, %185 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%187 = chlo.broadcast_add %184, %186 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%188 = "mhlo.einsum"(%187, %35) {einsum_config = "abc,cd->abd"} : (tensor<1x512x768xf32>, tensor<768x3072xf32>) -> tensor<1x512x3072xf32>
%189 = chlo.broadcast_add %188, %34 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x3072xf32>, tensor<3072xf32>) -> tensor<1x512x3072xf32>
%190 = chlo.broadcast_power %189, %5 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%191 = chlo.broadcast_multiply %190, %6 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%192 = chlo.broadcast_add %189, %191 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%193 = chlo.broadcast_multiply %192, %2 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%194 = "mhlo.tanh"(%193) : (tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%195 = chlo.broadcast_add %194, %4 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%196 = chlo.broadcast_multiply %189, %3 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x3072xf32>, tensor<f32>) -> tensor<1x512x3072xf32>
%197 = chlo.broadcast_multiply %196, %195 : (tensor<1x512x3072xf32>, tensor<1x512x3072xf32>) -> tensor<1x512x3072xf32>
%198 = "mhlo.einsum"(%197, %37) {einsum_config = "abc,cd->abd"} : (tensor<1x512x3072xf32>, tensor<3072x768xf32>) -> tensor<1x512x768xf32>
%199 = chlo.broadcast_add %198, %36 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%200 = chlo.broadcast_add %199, %187 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%201 = "mhlo.reduce"(%200, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%202 = chlo.broadcast_divide %201, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%203 = "mhlo.reshape"(%202) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%204 = chlo.broadcast_subtract %200, %203 : (tensor<1x512x768xf32>, tensor<1x512x1xf32>) -> tensor<1x512x768xf32>
%205 = chlo.broadcast_multiply %204, %204 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%206 = "mhlo.reduce"(%205, %10) ( {
^bb0(%arg44: tensor<f32>, %arg45: tensor<f32>):  // no predecessors
  %225 = mhlo.add %arg44, %arg45 : tensor<f32>
  "mhlo.return"(%225) : (tensor<f32>) -> ()
}) {dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x768xf32>, tensor<f32>) -> tensor<1x512xf32>
%207 = chlo.broadcast_divide %206, %11 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512xf32>, tensor<f32>) -> tensor<1x512xf32>
%208 = "mhlo.reshape"(%207) : (tensor<1x512xf32>) -> tensor<1x512x1xf32>
%209 = chlo.broadcast_add %208, %8 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<1x512x1xf32>, tensor<f32>) -> tensor<1x512x1xf32>
%210 = "mhlo.rsqrt"(%209) : (tensor<1x512x1xf32>) -> tensor<1x512x1xf32>
%211 = chlo.broadcast_multiply %210, %38 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1x512x1xf32>, tensor<768xf32>) -> tensor<1x512x768xf32>
%212 = chlo.broadcast_multiply %200, %211 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%213 = chlo.broadcast_multiply %203, %211 : (tensor<1x512x1xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%214 = chlo.broadcast_subtract %39, %213 {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
%215 = chlo.broadcast_add %212, %214 : (tensor<1x512x768xf32>, tensor<1x512x768xf32>) -> tensor<1x512x768xf32>
// --- Pooler: slice out sequence position 0 (the first token of the 512),
// dense 768->768 (%15/%14) followed by tanh. ---
%216 = "mhlo.slice"(%215) {limit_indices = dense<[1, 1, 768]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<1x512x768xf32>) -> tensor<1x1x768xf32>
%217 = "mhlo.reshape"(%216) : (tensor<1x1x768xf32>) -> tensor<1x768xf32>
%218 = "mhlo.dot"(%217, %15) : (tensor<1x768xf32>, tensor<768x768xf32>) -> tensor<1x768xf32>
%219 = "mhlo.broadcast_in_dim"(%14) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<768xf32>) -> tensor<1x768xf32>
%220 = mhlo.add %218, %219 : tensor<1x768xf32>
%221 = "mhlo.tanh"(%220) : (tensor<1x768xf32>) -> tensor<1x768xf32>
// --- Classifier head: dense 768->5 (%52) plus bias (%51) -> logits. ---
%222 = "mhlo.dot"(%221, %52) : (tensor<1x768xf32>, tensor<768x5xf32>) -> tensor<1x5xf32>
%223 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<1x5xf32>
%224 = mhlo.add %222, %223 : tensor<1x5xf32>
return %224 : tensor<1x5xf32>
}
}
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:725:0: error: could not lower resource op to flow: tf.ResourceScatterAdd
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:678:0: note: called from
/usr/local/google/home/laurenzo/src/ModelCompiler/nlp_gen/bert_gen.py:61:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:983:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3983:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py:668:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:1007:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3291:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3456:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/eager/function.py:3109:0: note: called from
/usr/local/google/home/laurenzo/src/iree/iree.venv/lib/python3.9/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:725:0: note: see current operation: "tf.ResourceScatterAdd"(%0, %134, %670) {_class = ["loc:@bert_classifier/bert_encoder_1/word_embeddings/Gather/resource"], device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (!iree.ptr<tensor<100x768xf32>>, tensor<512xi32>, tensor<512x768xf32>) -> ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment