@benvanik
Created June 15, 2022 19:51
unidirectional_lstm.mlir
This file has been truncated.
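// Overview of the dump below: the IR of a unidirectional LSTM as it moves
// through the early MHLO-level passes of the IREE pipeline. Each
// "IR Dump After <Pass>" marker shows the state of a function (or, for the
// module-level pass, the whole module) after TopLevelSCFToCFG,
// MHLOToMHLOPreprocessing, Canonicalizer, ShapeToShapeLowering, and
// ConvertShapeToStandard. The same three functions recur:
//   * @ForwardLoopCond_gFAnjWGSoLs__.167 - the loop condition; returns
//     %arg0 < %arg1 as a tensor<i1>.
//   * @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189 - the
//     time-step loop, lowered to cf.br / cf.cond_br blocks; ^bb2 performs one
//     LSTM step per iteration over the 5-step sequence dimension.
//   * @main - reshapes and pads the 1x5x2x2 input to 5x1x64, turns the 1x5
//     input into a 5x1x1 per-step value that the loop compares against zero
//     to gate state updates (effectively a per-step mask), calls @Forward,
//     and returns the second 5x1x10 per-step state history.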
// -----// IR Dump After TopLevelSCFToCFG //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<1.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%8 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%9 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%115 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%115) : (tensor<f32>) -> ()
}
%10 = "mhlo.compare"(%9, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%11 = mhlo.convert(%10) : (tensor<5xi1>) -> tensor<5xi32>
%12 = mhlo.multiply %11, %cst_0 : tensor<5xi32>
%13 = mhlo.reduce(%12 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%115 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%115) : (tensor<i32>) -> ()
}
%14 = mhlo.subtract %cst_2, %13 : tensor<i32>
%15 = "mhlo.compare"(%14, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%16 = "mhlo.reverse"(%9) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%17 = "mhlo.compare"(%16, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%18 = mhlo.convert(%17) : (tensor<5xi1>) -> tensor<5xi32>
%19 = mhlo.multiply %18, %cst_0 : tensor<5xi32>
%20 = mhlo.reduce(%19 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%115 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%115) : (tensor<i32>) -> ()
}
%21 = mhlo.subtract %cst_2, %20 : tensor<i32>
%22 = "mhlo.select"(%15, %cst_6, %21) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%23 = mhlo.convert(%22) : (tensor<i32>) -> tensor<i64>
%24 = mhlo.subtract %cst_2, %14 : tensor<i32>
%25 = mhlo.convert(%24) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%23, %25, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%26: tensor<i64>, %27: tensor<i64>, %28: tensor<40xf32>, %29: tensor<i64>, %30: tensor<74x40xf32>, %31: tensor<i64>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x64xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5xi64>, %38: tensor<5x1x10xf32>, %39: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%40 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%41 = tensor.extract %40[] : tensor<i1>
cf.cond_br %41, ^bb2(%26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%26, %31, %32, %33, %37, %38, %39 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%42: tensor<i64>, %43: tensor<i64>, %44: tensor<40xf32>, %45: tensor<i64>, %46: tensor<74x40xf32>, %47: tensor<i64>, %48: tensor<1x10xf32>, %49: tensor<1x10xf32>, %50: tensor<5x1x64xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5x1x1xf32>, %53: tensor<5xi64>, %54: tensor<5x1x10xf32>, %55: tensor<5x1x10xf32>): // pred: ^bb1
%56 = mhlo.add %42, %cst_5 : tensor<i64>
%57 = "mhlo.gather"(%51, %42) {dimension_numbers = #mhlo.gather<offset_dims = [0, 1], collapsed_slice_dims = [0], start_index_map = [0]>, slice_sizes = dense<1> : tensor<3xi64>} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%58 = "mhlo.reshape"(%57) : (tensor<1x1xf32>) -> tensor<1xf32>
%59 = "mhlo.broadcast_in_dim"(%58) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %6 : tensor<1x10xf32>
%61 = "mhlo.compare"(%60, %7) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%62 = "mhlo.gather"(%50, %42) {dimension_numbers = #mhlo.gather<offset_dims = [0, 1], collapsed_slice_dims = [0], start_index_map = [0]>, slice_sizes = dense<[1, 1, 64]> : tensor<3xi64>} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%63 = "mhlo.concatenate"(%62, %49) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%64 = "mhlo.dot"(%63, %46) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%65 = "mhlo.reshape"(%44) : (tensor<40xf32>) -> tensor<1x40xf32>
%66 = mhlo.add %64, %65 : tensor<1x40xf32>
%67 = "mhlo.slice"(%66) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%68 = mhlo.multiply %67, %8 : tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %8 : tensor<1x10xf32>
%71 = mhlo.add %70, %8 : tensor<1x10xf32>
%72 = mhlo.multiply %71, %48 : tensor<1x10xf32>
%73 = "mhlo.slice"(%66) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%74 = mhlo.multiply %73, %8 : tensor<1x10xf32>
%75 = mhlo.tanh %74 : tensor<1x10xf32>
%76 = mhlo.multiply %75, %8 : tensor<1x10xf32>
%77 = mhlo.add %76, %8 : tensor<1x10xf32>
%78 = "mhlo.slice"(%66) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%79 = mhlo.tanh %78 : tensor<1x10xf32>
%80 = mhlo.multiply %77, %79 : tensor<1x10xf32>
%81 = mhlo.add %72, %80 : tensor<1x10xf32>
%82 = mhlo.minimum %81, %4 : tensor<1x10xf32>
%83 = mhlo.maximum %82, %5 : tensor<1x10xf32>
%84 = "mhlo.select"(%61, %48, %83) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%85 = "mhlo.reshape"(%57) : (tensor<1x1xf32>) -> tensor<1xf32>
%86 = "mhlo.broadcast_in_dim"(%85) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %6 : tensor<1x10xf32>
%88 = "mhlo.compare"(%87, %7) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%89 = "mhlo.slice"(%66) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%90 = mhlo.multiply %89, %8 : tensor<1x10xf32>
%91 = mhlo.tanh %90 : tensor<1x10xf32>
%92 = mhlo.multiply %91, %8 : tensor<1x10xf32>
%93 = mhlo.add %92, %8 : tensor<1x10xf32>
%94 = mhlo.tanh %83 : tensor<1x10xf32>
%95 = mhlo.multiply %93, %94 : tensor<1x10xf32>
%96 = "mhlo.select"(%88, %49, %95) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%97 = "mhlo.reshape"(%47) : (tensor<i64>) -> tensor<1xi64>
%98 = "mhlo.reshape"(%42) : (tensor<i64>) -> tensor<1xi64>
%99 = mhlo.convert(%98) : (tensor<1xi64>) -> tensor<1xi32>
%100 = "mhlo.reshape"(%99) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %97, %100) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%102 = "mhlo.reshape"(%84) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%99) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%105 = "mhlo.reshape"(%96) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%106 = "mhlo.reshape"(%99) : (tensor<1xi32>) -> tensor<i32>
%107 = "mhlo.dynamic-update-slice"(%55, %105, %106, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%56, %43, %44, %45, %46, %47, %84, %96, %50, %51, %52, %101, %104, %107 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%108: tensor<i64>, %109: tensor<i64>, %110: tensor<1x10xf32>, %111: tensor<1x10xf32>, %112: tensor<5xi64>, %113: tensor<5x1x10xf32>, %114: tensor<5x1x10xf32>): // pred: ^bb1
return %108, %112, %113, %114, %109, %110, %111 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
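// Note on the body of ^bb2 above: the selected 1x64 input step is concatenated
// with the previous 1x10 hidden state and multiplied by the 74x40 weight
// operand, and the resulting 1x40 pre-activation is sliced into four 1x10
// gates. Each "multiply by 0.5, tanh, multiply by 0.5, add 0.5" chain is the
// identity sigmoid(x) = 0.5 * tanh(x / 2) + 0.5, so the slices at [10,20),
// [20,30), and [30,40) act as sigmoid gates while the [0,10) slice goes
// through a plain tanh; the new cell state is clipped to [-10, 10], and the
// broadcast 5x1x1 value gates each step by selecting between the previous and
// the updated states.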
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After ConvertShapeToStandard //----- //
module {
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
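// NOTE: the loop body (^bb2) of @Forward_o16DF3vQKaI__... above is a single LSTM cell step,
// readable directly from the slices of the 1x40 pre-activation: columns [0,10) go through a
// plain tanh (the candidate), while [10,20), [20,30), and [30,40) each go through
// 0.5 * tanh(0.5 * x) + 0.5, i.e. sigmoid(x), and act functionally as the input, forget, and
// output gates (gate names inferred from how each slice is used, not recorded in the dump).
// The new cell state f*c_prev + i*g is clamped to [-10, 10], the new hidden state is
// o * tanh(c), and a per-step flag broadcast from the 5x1x1 operand selects between keeping
// the previous state and taking the newly computed one.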
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
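// NOTE: IR appears to be dumped after every pass invocation regardless of whether the pass
// changed anything (consistent with MLIR's --mlir-print-ir-after-all; the exact flag used is
// not recorded in this file), so several of the Canonicalizer dumps that follow repeat a
// function with no visible difference from the previous dump of that function. The first
// structural changes show up in the later dumps annotated below.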
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
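// NOTE: compared with the previous dump of @Forward_o16DF3vQKaI__..., this Canonicalizer run
// has folded the loop-condition call away: ^bb1 now computes the i64 compare inline instead
// of calling @ForwardLoopCond_gFAnjWGSoLs__.167, and two dead loop-carried values (a
// constant-zero i64 that was only threaded through unchanged, and the unused second 5x1x1
// operand) are no longer block arguments.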
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%108 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%108) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%108 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%108) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%108 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%108) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<74x40xf32>, %29: tensor<i64>, %30: tensor<1x10xf32>, %31: tensor<1x10xf32>, %32: tensor<5x1x64xf32>, %33: tensor<5x1x1xf32>, %34: tensor<5xi64>, %35: tensor<5x1x10xf32>, %36: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%37 = "mhlo.compare"(%25, %26) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
%38 = tensor.extract %37[] : tensor<i1>
cf.cond_br %38, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %29, %30, %31, %34, %35, %36 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%39: tensor<i64>, %40: tensor<i64>, %41: tensor<40xf32>, %42: tensor<74x40xf32>, %43: tensor<i64>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5xi64>, %49: tensor<5x1x10xf32>, %50: tensor<5x1x10xf32>): // pred: ^bb1
%51 = mhlo.add %39, %cst_5 : tensor<i64>
%52 = "mhlo.torch_index_select"(%47, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%53 = "mhlo.reshape"(%52) : (tensor<1x1xf32>) -> tensor<1xf32>
%54 = "mhlo.broadcast_in_dim"(%53) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%55 = "mhlo.compare"(%54, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%56 = "mhlo.torch_index_select"(%46, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%57 = "mhlo.concatenate"(%56, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%58 = "mhlo.dot"(%57, %42) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%59 = "mhlo.reshape"(%41) : (tensor<40xf32>) -> tensor<1x40xf32>
%60 = mhlo.add %58, %59 : tensor<1x40xf32>
%61 = "mhlo.slice"(%60) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%62 = mhlo.multiply %61, %7 : tensor<1x10xf32>
%63 = mhlo.tanh %62 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %7 : tensor<1x10xf32>
%65 = mhlo.add %64, %7 : tensor<1x10xf32>
%66 = mhlo.multiply %65, %44 : tensor<1x10xf32>
%67 = "mhlo.slice"(%60) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %7 : tensor<1x10xf32>
%71 = mhlo.add %70, %7 : tensor<1x10xf32>
%72 = "mhlo.slice"(%60) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %71, %73 : tensor<1x10xf32>
%75 = mhlo.add %66, %74 : tensor<1x10xf32>
%76 = mhlo.minimum %75, %4 : tensor<1x10xf32>
%77 = mhlo.maximum %76, %5 : tensor<1x10xf32>
%78 = "mhlo.select"(%55, %44, %77) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%79 = "mhlo.reshape"(%52) : (tensor<1x1xf32>) -> tensor<1xf32>
%80 = "mhlo.broadcast_in_dim"(%79) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%81 = "mhlo.compare"(%80, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%82 = "mhlo.slice"(%60) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%83 = mhlo.multiply %82, %7 : tensor<1x10xf32>
%84 = mhlo.tanh %83 : tensor<1x10xf32>
%85 = mhlo.multiply %84, %7 : tensor<1x10xf32>
%86 = mhlo.add %85, %7 : tensor<1x10xf32>
%87 = mhlo.tanh %77 : tensor<1x10xf32>
%88 = mhlo.multiply %86, %87 : tensor<1x10xf32>
%89 = "mhlo.select"(%81, %45, %88) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%90 = "mhlo.reshape"(%43) : (tensor<i64>) -> tensor<1xi64>
%91 = "mhlo.reshape"(%39) : (tensor<i64>) -> tensor<1xi64>
%92 = mhlo.convert(%91) : (tensor<1xi64>) -> tensor<1xi32>
%93 = "mhlo.reshape"(%92) : (tensor<1xi32>) -> tensor<i32>
%94 = "mhlo.dynamic-update-slice"(%48, %90, %93) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%95 = "mhlo.reshape"(%78) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%96 = "mhlo.reshape"(%92) : (tensor<1xi32>) -> tensor<i32>
%97 = "mhlo.dynamic-update-slice"(%49, %95, %96, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%98 = "mhlo.reshape"(%89) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%99 = "mhlo.reshape"(%92) : (tensor<1xi32>) -> tensor<i32>
%100 = "mhlo.dynamic-update-slice"(%50, %98, %99, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%51, %40, %41, %42, %43, %78, %89, %46, %47, %94, %97, %100 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%101: tensor<i64>, %102: tensor<i64>, %103: tensor<1x10xf32>, %104: tensor<1x10xf32>, %105: tensor<5xi64>, %106: tensor<5x1x10xf32>, %107: tensor<5x1x10xf32>): // pred: ^bb1
return %101, %105, %106, %107, %102, %103, %104 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
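// NOTE: this dump of @main already contains the loop from @Forward_o16DF3vQKaI__... inline
// (no call): the CFG loop now lives directly in @main, carries only 9 values, and keeps a
// single mhlo.dynamic-update-slice that writes the 5x1x10 result @main returns; the 5xi64
// index vector and the second 5x1x10 accumulator seen in earlier dumps are gone.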
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert(%25) : (tensor<i32>) -> tensor<i64>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert(%27) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i64>, %30: tensor<i64>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i64>, %41: tensor<i64>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i64>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i64>) -> tensor<1xi64>
%89 = mhlo.convert(%88) : (tensor<1xi64>) -> tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
// -----// IR Dump After Inliner //----- //
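// NOTE: after inlining, the module contains only @main; the private
// @ForwardLoopCond_gFAnjWGSoLs__.167 and @Forward_o16DF3vQKaI__... callees are gone.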
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert(%25) : (tensor<i32>) -> tensor<i64>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert(%27) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i64>, %30: tensor<i64>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i64>, %41: tensor<i64>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i64>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i64>) -> tensor<1xi64>
%89 = mhlo.convert(%88) : (tensor<1xi64>) -> tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteI64ToI32Pass //----- //
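// NOTE: DemoteI64ToI32 rewrites the remaining i64 tensors to i32: the loop bounds and
// counter, the dense<1> step constant, and the mhlo.torch_index_select indices are now
// tensor<i32>, and the convert ops that previously widened i32 to i64 become same-type
// i32 converts.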
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert %25 : tensor<i32>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert %27 : tensor<i32>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i32>, %30: tensor<i32>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i32>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i32>) -> tensor<1xi32>
%89 = mhlo.convert %88 : tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- //
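// [annotation] All floating-point tensors in this module are already f32 and the body of @main matches the preceding dump, so the demotion pass appears to be a no-op on this input.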
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert %25 : tensor<i32>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert %27 : tensor<i32>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i32>, %30: tensor<i32>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i32>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i32>) -> tensor<1xi32>
%89 = mhlo.convert %88 : tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
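// [annotation] Relative to the previous dump, canonicalization drops the identity mhlo.convert ops on the i32 loop bounds and folds the reshape/convert/reshape chain around the loop counter, so the mhlo.dynamic-update-slice below takes the counter (%38) directly as its update index.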
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%89 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%89) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%89 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%89) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%89 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%89) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.subtract %cst_2, %17 : tensor<i32>
cf.br ^bb1(%25, %26, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%27: tensor<i32>, %28: tensor<i32>, %29: tensor<40xf32>, %30: tensor<74x40xf32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = "mhlo.compare"(%27, %28) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2(%27, %28, %29, %30, %31, %32, %33, %34, %35 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%35 : tensor<5x1x10xf32>)
^bb2(%38: tensor<i32>, %39: tensor<i32>, %40: tensor<40xf32>, %41: tensor<74x40xf32>, %42: tensor<1x10xf32>, %43: tensor<1x10xf32>, %44: tensor<5x1x64xf32>, %45: tensor<5x1x1xf32>, %46: tensor<5x1x10xf32>): // pred: ^bb1
%47 = mhlo.add %38, %cst_0 : tensor<i32>
%48 = "mhlo.torch_index_select"(%45, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%49 = "mhlo.reshape"(%48) : (tensor<1x1xf32>) -> tensor<1xf32>
%50 = "mhlo.broadcast_in_dim"(%49) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%51 = "mhlo.compare"(%50, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%52 = "mhlo.torch_index_select"(%44, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%53 = "mhlo.concatenate"(%52, %43) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%54 = "mhlo.dot"(%53, %41) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%55 = "mhlo.reshape"(%40) : (tensor<40xf32>) -> tensor<1x40xf32>
%56 = mhlo.add %54, %55 : tensor<1x40xf32>
%57 = "mhlo.slice"(%56) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%58 = mhlo.multiply %57, %0 : tensor<1x10xf32>
%59 = mhlo.tanh %58 : tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.add %60, %0 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %42 : tensor<1x10xf32>
%63 = "mhlo.slice"(%56) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%64 = mhlo.multiply %63, %0 : tensor<1x10xf32>
%65 = mhlo.tanh %64 : tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.add %66, %0 : tensor<1x10xf32>
%68 = "mhlo.slice"(%56) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %67, %69 : tensor<1x10xf32>
%71 = mhlo.add %62, %70 : tensor<1x10xf32>
%72 = mhlo.minimum %71, %2 : tensor<1x10xf32>
%73 = mhlo.maximum %72, %1 : tensor<1x10xf32>
%74 = "mhlo.select"(%51, %42, %73) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%75 = "mhlo.reshape"(%48) : (tensor<1x1xf32>) -> tensor<1xf32>
%76 = "mhlo.broadcast_in_dim"(%75) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%77 = "mhlo.compare"(%76, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%78 = "mhlo.slice"(%56) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%79 = mhlo.multiply %78, %0 : tensor<1x10xf32>
%80 = mhlo.tanh %79 : tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.add %81, %0 : tensor<1x10xf32>
%83 = mhlo.tanh %73 : tensor<1x10xf32>
%84 = mhlo.multiply %82, %83 : tensor<1x10xf32>
%85 = "mhlo.select"(%77, %43, %84) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%86 = "mhlo.reshape"(%85) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%87 = "mhlo.dynamic-update-slice"(%46, %86, %38, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%47, %39, %40, %41, %74, %85, %44, %45, %87 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%88: tensor<5x1x10xf32>): // pred: ^bb1
return %88 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
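// [annotation] CSE removes the duplicated reshape/broadcast_in_dim/compare chain on the per-step gating scalar; both mhlo.select ops in the loop body now reuse the single predicate %51.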
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%86 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%86) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%86 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%86) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%86 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%86) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.subtract %cst_2, %17 : tensor<i32>
cf.br ^bb1(%25, %26, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%27: tensor<i32>, %28: tensor<i32>, %29: tensor<40xf32>, %30: tensor<74x40xf32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = "mhlo.compare"(%27, %28) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2(%27, %28, %29, %30, %31, %32, %33, %34, %35 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%35 : tensor<5x1x10xf32>)
^bb2(%38: tensor<i32>, %39: tensor<i32>, %40: tensor<40xf32>, %41: tensor<74x40xf32>, %42: tensor<1x10xf32>, %43: tensor<1x10xf32>, %44: tensor<5x1x64xf32>, %45: tensor<5x1x1xf32>, %46: tensor<5x1x10xf32>): // pred: ^bb1
%47 = mhlo.add %38, %cst_0 : tensor<i32>
%48 = "mhlo.torch_index_select"(%45, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%49 = "mhlo.reshape"(%48) : (tensor<1x1xf32>) -> tensor<1xf32>
%50 = "mhlo.broadcast_in_dim"(%49) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%51 = "mhlo.compare"(%50, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%52 = "mhlo.torch_index_select"(%44, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%53 = "mhlo.concatenate"(%52, %43) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%54 = "mhlo.dot"(%53, %41) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%55 = "mhlo.reshape"(%40) : (tensor<40xf32>) -> tensor<1x40xf32>
%56 = mhlo.add %54, %55 : tensor<1x40xf32>
%57 = "mhlo.slice"(%56) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%58 = mhlo.multiply %57, %0 : tensor<1x10xf32>
%59 = mhlo.tanh %58 : tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.add %60, %0 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %42 : tensor<1x10xf32>
%63 = "mhlo.slice"(%56) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%64 = mhlo.multiply %63, %0 : tensor<1x10xf32>
%65 = mhlo.tanh %64 : tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.add %66, %0 : tensor<1x10xf32>
%68 = "mhlo.slice"(%56) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %67, %69 : tensor<1x10xf32>
%71 = mhlo.add %62, %70 : tensor<1x10xf32>
%72 = mhlo.minimum %71, %2 : tensor<1x10xf32>
%73 = mhlo.maximum %72, %1 : tensor<1x10xf32>
%74 = "mhlo.select"(%51, %42, %73) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%75 = "mhlo.slice"(%56) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%76 = mhlo.multiply %75, %0 : tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %77, %0 : tensor<1x10xf32>
%79 = mhlo.add %78, %0 : tensor<1x10xf32>
%80 = mhlo.tanh %73 : tensor<1x10xf32>
%81 = mhlo.multiply %79, %80 : tensor<1x10xf32>
%82 = "mhlo.select"(%51, %43, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%84 = "mhlo.dynamic-update-slice"(%46, %83, %38, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%47, %39, %40, %41, %74, %82, %44, %45, %84 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%85: tensor<5x1x10xf32>): // pred: ^bb1
return %85 : tensor<5x1x10xf32>
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
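// [annotation] In this module the pass only rewrites mhlo.reverse into iree_linalg_ext.reverse with an explicit linalg.init_tensor destination; the remaining mhlo ops are left for the next conversion.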
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%87 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%87) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%87 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%87) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = linalg.init_tensor [5] : tensor<5xf32>
%20 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%19 : tensor<5xf32>) : tensor<5xf32>
%21 = "mhlo.compare"(%20, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%22 = mhlo.convert(%21) : (tensor<5xi1>) -> tensor<5xi32>
%23 = mhlo.multiply %22, %cst_4 : tensor<5xi32>
%24 = mhlo.reduce(%23 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%87 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%87) : (tensor<i32>) -> ()
}
%25 = mhlo.subtract %cst_2, %24 : tensor<i32>
%26 = "mhlo.select"(%18, %cst, %25) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
cf.br ^bb1(%26, %27, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%28: tensor<i32>, %29: tensor<i32>, %30: tensor<40xf32>, %31: tensor<74x40xf32>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x64xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%37 = "mhlo.compare"(%28, %29) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%38 = tensor.extract %37[] : tensor<i1>
cf.cond_br %38, ^bb2(%28, %29, %30, %31, %32, %33, %34, %35, %36 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%36 : tensor<5x1x10xf32>)
^bb2(%39: tensor<i32>, %40: tensor<i32>, %41: tensor<40xf32>, %42: tensor<74x40xf32>, %43: tensor<1x10xf32>, %44: tensor<1x10xf32>, %45: tensor<5x1x64xf32>, %46: tensor<5x1x1xf32>, %47: tensor<5x1x10xf32>): // pred: ^bb1
%48 = mhlo.add %39, %cst_0 : tensor<i32>
%49 = "mhlo.torch_index_select"(%46, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%50 = "mhlo.reshape"(%49) : (tensor<1x1xf32>) -> tensor<1xf32>
%51 = "mhlo.broadcast_in_dim"(%50) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%52 = "mhlo.compare"(%51, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%53 = "mhlo.torch_index_select"(%45, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%54 = "mhlo.concatenate"(%53, %44) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%55 = "mhlo.dot"(%54, %42) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%56 = "mhlo.reshape"(%41) : (tensor<40xf32>) -> tensor<1x40xf32>
%57 = mhlo.add %55, %56 : tensor<1x40xf32>
%58 = "mhlo.slice"(%57) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%59 = mhlo.multiply %58, %0 : tensor<1x10xf32>
%60 = mhlo.tanh %59 : tensor<1x10xf32>
%61 = mhlo.multiply %60, %0 : tensor<1x10xf32>
%62 = mhlo.add %61, %0 : tensor<1x10xf32>
%63 = mhlo.multiply %62, %43 : tensor<1x10xf32>
%64 = "mhlo.slice"(%57) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%65 = mhlo.multiply %64, %0 : tensor<1x10xf32>
%66 = mhlo.tanh %65 : tensor<1x10xf32>
%67 = mhlo.multiply %66, %0 : tensor<1x10xf32>
%68 = mhlo.add %67, %0 : tensor<1x10xf32>
%69 = "mhlo.slice"(%57) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%70 = mhlo.tanh %69 : tensor<1x10xf32>
%71 = mhlo.multiply %68, %70 : tensor<1x10xf32>
%72 = mhlo.add %63, %71 : tensor<1x10xf32>
%73 = mhlo.minimum %72, %2 : tensor<1x10xf32>
%74 = mhlo.maximum %73, %1 : tensor<1x10xf32>
%75 = "mhlo.select"(%52, %43, %74) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%76 = "mhlo.slice"(%57) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.multiply %76, %0 : tensor<1x10xf32>
%78 = mhlo.tanh %77 : tensor<1x10xf32>
%79 = mhlo.multiply %78, %0 : tensor<1x10xf32>
%80 = mhlo.add %79, %0 : tensor<1x10xf32>
%81 = mhlo.tanh %74 : tensor<1x10xf32>
%82 = mhlo.multiply %80, %81 : tensor<1x10xf32>
%83 = "mhlo.select"(%52, %44, %82) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%84 = "mhlo.reshape"(%83) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%85 = "mhlo.dynamic-update-slice"(%47, %84, %39, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%48, %40, %41, %42, %75, %83, %45, %46, %85 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%86: tensor<5x1x10xf32>): // pred: ^bb1
return %86 : tensor<5x1x10xf32>
}
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
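// [annotation] The remaining mhlo ops are lowered onto the linalg/tensor dialects: reshape becomes tensor.collapse_shape/expand_shape, pad becomes tensor.pad, transpose and the elementwise math become linalg.generic, dot becomes linalg.matmul, concatenate becomes tensor.insert_slice into a fresh 1x74 init tensor, torch_index_select becomes a linalg.generic that does tensor.extract by the loop index, and dynamic-update-slice becomes a clamped tensor.insert_slice.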
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%cst_0 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_1 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_2 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<1> : tensor<i32>
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_7 = arith.constant dense<5> : tensor<i32>
%cst_8 = arith.constant dense<-2147483648> : tensor<i32>
%cst_9 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_10 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_11 = arith.constant dense<0x7F800000> : tensor<f32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<f32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%cst_14 = arith.constant 0.000000e+00 : f32
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_14 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%cst_15 = arith.constant 0x7F800000 : f32
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_15 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32 = arith.constant -2147483648 : i32
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_7, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_7 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32_16 = arith.constant -2147483648 : i32
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32_16 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_7, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%148 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_7, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_6, %cst_5, %cst_12, %cst_12, %3, %6, %cst_4 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
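  // [annotation] The blocks below implement the while loop: ^bb1 re-checks the i32 counter against its bound, ^bb2 is one time step of the recurrence, and ^bb3 returns the accumulated 5x1x10 result.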
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_3 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.addi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52, %63 : tensor<i32>, tensor<1x1xf32>) outs(%64 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %59[%148, %149, %150] : tensor<5x1x1xf32>
linalg.yield %151 : f32
} -> tensor<1x1xf32>
%66 = tensor.collapse_shape %65 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%67 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1xf32>) outs(%67 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%69 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%69 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<1x10xi1>
%71 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%72 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52, %71 : tensor<i32>, tensor<1x64xf32>) outs(%72 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %58[%148, %149, %150] : tensor<5x1x64xf32>
linalg.yield %151 : f32
} -> tensor<1x64xf32>
%c0 = arith.constant 0 : index
%c0_17 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c1_18 = arith.constant 1 : index
%c0_19 = arith.constant 0 : index
%c1_20 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c1_21 = arith.constant 1 : index
%c0_22 = arith.constant 0 : index
%c1_23 = arith.constant 1 : index
%c64_24 = arith.constant 64 : index
%c64_25 = arith.constant 64 : index
%c1_26 = arith.constant 1 : index
%c10 = arith.constant 10 : index
%c74 = arith.constant 74 : index
%74 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%c0_27 = arith.constant 0 : index
%c1_28 = arith.constant 1 : index
%c64_29 = arith.constant 64 : index
%75 = tensor.insert_slice %73 into %74[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%76 = arith.addi %c0_27, %c64_29 : index
%c1_30 = arith.constant 1 : index
%c10_31 = arith.constant 10 : index
%77 = tensor.insert_slice %57 into %75[0, %76] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%78 = arith.addi %76, %c10_31 : index
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%cst_32 = arith.constant 0.000000e+00 : f32
%80 = linalg.fill ins(%cst_32 : f32) outs(%79 : tensor<1x40xf32>) -> tensor<1x40xf32>
%81 = linalg.matmul ins(%77, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%80 : tensor<1x40xf32>) -> tensor<1x40xf32>
%82 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%83 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %82 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%83 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x40xf32>
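  // [annotation] The four 10-wide slices of %84 below are the gate pre-activations of the LSTM step: each sigmoid is expanded as 0.5 * tanh(0.5 * x) + 0.5 (the %cst_0 splat), the candidate path uses a plain tanh, and the new cell state is clamped to [-10, 10] by the minf/maxf pair before feeding the output-path tanh.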
%85 = tensor.extract_slice %84[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%92 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%92 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%94 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%93, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%94 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%96 = tensor.extract_slice %84[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98 : tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%101 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%102 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%101 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%103 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%104 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%102, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%103 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%105 = tensor.extract_slice %84[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%105 : tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%104, %107 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%95, %109 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%114 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%115 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%113, %cst_1 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%114 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.maxf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%116 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%117 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %56, %115 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%116 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%118 = tensor.extract_slice %84[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120 : tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%124, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%115 : tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%129 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%130 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%126, %128 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%129 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%131 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%132 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %57, %130 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%131 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%133 = tensor.expand_shape %132 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%c0_i32 = arith.constant 0 : i32
%134 = tensor.extract %52[] : tensor<i32>
%c4_i32 = arith.constant 4 : i32
%135 = arith.maxsi %c0_i32, %134 : i32
%136 = arith.minsi %135, %c4_i32 : i32
%137 = arith.index_cast %136 : i32 to index
%138 = tensor.extract %cst[] : tensor<i32>
%c0_i32_33 = arith.constant 0 : i32
%139 = arith.maxsi %c0_i32, %138 : i32
%140 = arith.minsi %139, %c0_i32_33 : i32
%141 = arith.index_cast %140 : i32 to index
%142 = tensor.extract %cst[] : tensor<i32>
%c0_i32_34 = arith.constant 0 : i32
%143 = arith.maxsi %c0_i32, %142 : i32
%144 = arith.minsi %143, %c0_i32_34 : i32
%145 = arith.index_cast %144 : i32 to index
%146 = tensor.insert_slice %133 into %60[%137, %141, %145] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %117, %132, %58, %59, %146 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%147: tensor<5x1x10xf32>): // pred: ^bb1
return %147 : tensor<5x1x10xf32>
}
// -----// IR Dump After ReconcileUnrealizedCasts //----- //
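// Note (reader annotation, hedged): this dump prints the whole module, so the affine maps are
// hoisted into #mapN aliases. The while-style loop from the original program appears here as
// explicit blocks: ^bb1 evaluates the loop condition, ^bb2 is the loop body (what looks like one
// LSTM time step), and ^bb3 returns the accumulated 5x1x10 output.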
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%cst_0 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_1 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_2 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<1> : tensor<i32>
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_7 = arith.constant dense<5> : tensor<i32>
%cst_8 = arith.constant dense<-2147483648> : tensor<i32>
%cst_9 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_10 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_11 = arith.constant dense<0x7F800000> : tensor<f32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<f32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%cst_14 = arith.constant 0.000000e+00 : f32
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_14 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%cst_15 = arith.constant 0x7F800000 : f32
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_15 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32 = arith.constant -2147483648 : i32
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_7, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_7 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32_16 = arith.constant -2147483648 : i32
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32_16 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_7, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%148 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_7, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_6, %cst_5, %cst_12, %cst_12, %3, %6, %cst_4 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_3 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.addi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%65 = linalg.generic {indexing_maps = [#map8, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%52, %63 : tensor<i32>, tensor<1x1xf32>) outs(%64 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %59[%148, %149, %150] : tensor<5x1x1xf32>
linalg.yield %151 : f32
} -> tensor<1x1xf32>
%66 = tensor.collapse_shape %65 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%67 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1xf32>) outs(%67 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%69 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%69 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<1x10xi1>
%71 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%72 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%73 = linalg.generic {indexing_maps = [#map8, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%52, %71 : tensor<i32>, tensor<1x64xf32>) outs(%72 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %58[%148, %149, %150] : tensor<5x1x64xf32>
linalg.yield %151 : f32
} -> tensor<1x64xf32>
%c0 = arith.constant 0 : index
%c0_17 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c1_18 = arith.constant 1 : index
%c0_19 = arith.constant 0 : index
%c1_20 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c1_21 = arith.constant 1 : index
%c0_22 = arith.constant 0 : index
%c1_23 = arith.constant 1 : index
%c64_24 = arith.constant 64 : index
%c64_25 = arith.constant 64 : index
%c1_26 = arith.constant 1 : index
%c10 = arith.constant 10 : index
%c74 = arith.constant 74 : index
%74 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%c0_27 = arith.constant 0 : index
%c1_28 = arith.constant 1 : index
%c64_29 = arith.constant 64 : index
%75 = tensor.insert_slice %73 into %74[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%76 = arith.addi %c0_27, %c64_29 : index
%c1_30 = arith.constant 1 : index
%c10_31 = arith.constant 10 : index
%77 = tensor.insert_slice %57 into %75[0, %76] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%78 = arith.addi %76, %c10_31 : index
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%cst_32 = arith.constant 0.000000e+00 : f32
%80 = linalg.fill ins(%cst_32 : f32) outs(%79 : tensor<1x40xf32>) -> tensor<1x40xf32>
%81 = linalg.matmul ins(%77, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%80 : tensor<1x40xf32>) -> tensor<1x40xf32>
%82 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%83 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %82 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%83 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x40xf32>
%85 = tensor.extract_slice %84[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%92 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%92 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%94 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%95 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%93, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%94 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%96 = tensor.extract_slice %84[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98 : tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%101 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%102 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%101 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%103 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%104 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%102, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%103 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%105 = tensor.extract_slice %84[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%105 : tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%104, %107 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%95, %109 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%114 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%115 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%113, %cst_1 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%114 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.maxf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%116 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%117 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %56, %115 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%116 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%118 = tensor.extract_slice %84[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120 : tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%124, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%115 : tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%129 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%130 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%126, %128 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%129 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%131 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%132 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %57, %130 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%131 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%133 = tensor.expand_shape %132 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%c0_i32 = arith.constant 0 : i32
%134 = tensor.extract %52[] : tensor<i32>
%c4_i32 = arith.constant 4 : i32
%135 = arith.maxsi %c0_i32, %134 : i32
%136 = arith.minsi %135, %c4_i32 : i32
%137 = arith.index_cast %136 : i32 to index
%138 = tensor.extract %cst[] : tensor<i32>
%c0_i32_33 = arith.constant 0 : i32
%139 = arith.maxsi %c0_i32, %138 : i32
%140 = arith.minsi %139, %c0_i32_33 : i32
%141 = arith.index_cast %140 : i32 to index
%142 = tensor.extract %cst[] : tensor<i32>
%c0_i32_34 = arith.constant 0 : i32
%143 = arith.maxsi %c0_i32, %142 : i32
%144 = arith.minsi %143, %c0_i32_34 : i32
%145 = arith.index_cast %144 : i32 to index
%146 = tensor.insert_slice %133 into %60[%137, %141, %145] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %117, %132, %58, %59, %146 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%147: tensor<5x1x10xf32>): // pred: ^bb1
return %147 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
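// Note (reader annotation, hedged): compared with the previous dump, canonicalization appears to
// have folded the redundant index/i32 constants and the clamping of the always-zero slice offsets,
// so the tensor.insert_slice at the end of the loop body now uses literal offsets [%133, 0, 0]
// instead of three separately computed indices.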
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
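// The ops below appear to build the fused LSTM gate pre-activations: the gathered time-step
// input %71 (1x64) and the carried hidden state %57 (1x10) are packed into a single 1x74 row,
// multiplied by the 74x40 weight matrix %55, and offset by the 40-element bias %54 (expanded to
// 1x40). The 1x40 result %80 is then sliced into four 1x10 gates at column offsets 0, 10, 20, 30.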
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
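// Gate nonlinearities: each x*0.5 -> tanh -> *0.5 -> +0.5 chain below is the identity
// sigmoid(x) = 0.5*tanh(x/2) + 0.5, spelled out with the splat constant %cst_2. Reading the
// slices that way, offset 20 looks like the forget gate (multiplied into the carried cell state
// %56), offset 10 the input gate, offset 0 (plain tanh) the candidate cell update, and offset 30
// the output gate.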
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
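// %111 is the updated cell state f*c_prev + i*g, clamped to [-10, 10] via the splat constants
// %cst_4 / %cst_3. The select that follows keeps the carried cell %56 wherever the per-time-step
// predicate %69 holds (apparently a sequence/padding mask derived from %arg0) and takes the
// freshly computed cell otherwise.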
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
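// %126 = output gate * tanh(new cell). The select below again falls back to the carried hidden
// state %57 on masked steps; the result %128 is reshaped to 1x1x10 and written into row %133 of
// the 5x1x10 output, where %133 is the loop counter %52 clamped to [0, 4].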
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
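// Note: the function above already carries the lowered loop as plain CFG control
// flow: ^bb1 compares the induction variable against the trip count, ^bb2 appears
// to run one LSTM cell step, and ^bb3 returns the accumulated 5x1x10 result. In
// ^bb2 the 1x74 concatenation of the gathered 1x64 input slice and the previous
// 1x10 hidden state is multiplied by the 74x40 weight matrix; of the four 1x10
// slices of the 1x40 result (offsets 0, 10, 20, 30), the three gate slices go
// through the sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5 identity while the candidate
// slice uses a plain tanh, the updated cell state is clamped to [-10, 10], and
// the new hidden state is inserted into the 5x1x10 output at the clamped step
// index. The IREEImportPublic dump that follows shows the same IR.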
// -----// IR Dump After IREEImportPublic //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
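// Note: SanitizeModuleNames appears to leave this module untouched; the dump
// below matches the one above, since the @main symbol is already a legal name.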
// -----// IR Dump After SanitizeModuleNames //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
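// Note: in the WrapEntryPointsPass dump below, the original entry point is moved
// into a private @_main, and a new public @main is emitted that imports the
// !hal.buffer_view arguments into tensors via hal.tensor.import, calls @_main,
// and exports the 5x1x10 result back to a !hal.buffer_view.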
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
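// Note on the prologue above: before entering the cf.br loop, @_main derives the loop
// bounds from the transposed 1x5 input (%arg0). It marks the entries equal to zero,
// multiplies that mask by the constant [1, 2, 3, 4, 5], and max-reduces once over the
// values and once over their reversal; the two reductions give the last and first
// zero-valued positions, and the loop then iterates over exactly that range (zero
// iterations when no entry is zero). A minimal NumPy restatement of that arithmetic
// follows; the name "flags" and the helper are interpretive, not taken from the IR.
import numpy as np

def loop_bounds(flags):
    # flags corresponds to the transposed %arg0 row; a 0 entry marks a step the loop visits.
    flags = np.asarray(flags).reshape(5)
    positions = np.arange(1, 6)                      # the dense<[1, 2, 3, 4, 5]> constant
    last = int(np.max((flags == 0) * positions))     # 1-based position of the last zero, 0 if none
    rlast = int(np.max((flags[::-1] == 0) * positions))
    start = 0 if last == 0 else 5 - rlast            # 0-based index of the first zero entry
    end = last                                       # one past the 0-based index of the last zero
    return start, end                                # the loop in ^bb1/^bb2 runs for t in [start, end)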
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
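// Note on the loop body above: each trip through ^bb2 is one LSTM cell step. The 1x64
// input slice for step t and the previous 1x10 hidden state are packed into a 1x74 row
// (the two tensor.insert_slice ops), multiplied by the 74x40 weight constant (all 0.42)
// and offset by the 40-element zero bias; the four 1x10 slices of the result are the
// gate pre-activations. Sigmoid shows up as the 0.5*tanh(0.5*x)+0.5 pattern, the new
// cell state is clamped to [-10, 10], and a final select keyed off the sign of the
// corresponding %arg0 entry decides whether the step keeps the previous h/c or the
// freshly computed ones before h is written into the 5x1x10 result. A minimal NumPy
// sketch of the gate math (names such as x_t, h_prev, c_prev, W, b are interpretive):
import numpy as np

def lstm_step(x_t, h_prev, c_prev, W, b, cell_clip=10.0):
    xh = np.concatenate([x_t, h_prev], axis=-1)        # (1, 74), like the insert_slice pair
    gates = xh @ W + b                                  # (1, 40), the matmul + bias add
    j, i, f, o = np.split(gates, 4, axis=-1)            # slices [0:10], [10:20], [20:30], [30:40]
    sigmoid = lambda z: 0.5 * np.tanh(0.5 * z) + 0.5    # the tanh-based sigmoid in the IR
    c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(j)   # forget * old cell + input * candidate
    c = np.clip(c, -cell_clip, cell_clip)               # the minf/maxf clamp against +/-10
    h = sigmoid(o) * np.tanh(c)                         # output gate times squashed cell
    return h, c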
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
// -----// IR Dump After SymbolDCE //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
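// A rough reading of the loop body in ^bb2 above (an interpretation of the IR, not part of the
// compiler output): it is the lowered LSTM cell. With c = %45 (previous cell state), h = %46
// (previous hidden state), and z = [x_t, h] * W + b (the 1x74 * 74x40 matmul plus bias, %67),
// the 0.5 * tanh(0.5 * x) + 0.5 pattern is sigmoid(x), so the arithmetic works out to roughly:
//   g     = tanh(z[:, 0:10])
//   i     = sigmoid(z[:, 10:20])
//   f     = sigmoid(z[:, 20:30])
//   o     = sigmoid(z[:, 30:40])
//   c_new = clamp(f * c + i * g, -10, 10)
//   h_new = o * tanh(c_new)
// The select ops against the %57 mask (arg0[t] > 0) keep the previous c/h where the mask holds
// and take c_new/h_new otherwise, and h is then written into row t of the 5x1x10 result via the
// clamped dynamic tensor.insert_slice at the end of the block.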
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- //
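// The pass named above demotes f64 types and constants to f32 so the whole module runs in
// single precision. A minimal sketch of the rewrite (hypothetical op, not taken from this
// module):
//   %x = arith.constant 1.000000e+00 : f64   // before
//   %x = arith.constant 1.000000e+00 : f32   // after
// This program is already entirely f32, so the pass has nothing to demote here and the IR
// below matches the previous dump.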
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
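// ConvertConv2D1x1ConvToMatmul rewrites convolutions with a 1x1 filter window into matmuls on
// collapsed shapes. A minimal sketch of the idea (hypothetical shapes, not taken from this
// module):
//   %0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
//          ins(%in, %filter : tensor<1x8x8x16xf32>, tensor<1x1x16x32xf32>)
//          outs(%acc : tensor<1x8x8x32xf32>) -> tensor<1x8x8x32xf32>
// becomes, after collapsing the unit filter window and the spatial dims,
//   %1 = linalg.matmul ins(%in2d, %filter2d : tensor<64x16xf32>, tensor<16x32xf32>)
//          outs(%acc2d : tensor<64x32xf32>) -> tensor<64x32xf32>
// There are no convolutions in this program (the LSTM is already expressed with linalg.matmul),
// so the pass leaves @main and @_main untouched.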
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegality //----- //
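// VerifyInputLegality is a check rather than a rewrite: it verifies that no unconverted
// input-dialect ops remain after the input conversion pipeline and fails compilation if any
// are found. Presumably a leftover op such as
//   %0 = "mhlo.add"(%a, %b) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
// would be reported here; since everything has already been lowered to linalg/tensor ops,
// the dumps around this pass are identical.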
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
// -----// IR Dump After VerifyInputLegality //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
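// NOTE (annotation, not part of the pass dump): the loop body in ^bb2 above appears
// to be one LSTM time step. Under that reading, %62 concatenates the 1x64 padded
// input slice for the current step with the 1x10 hidden state, the matmul against
// the fused 74x40 weights plus the 40-wide bias yields the gate pre-activations %67,
// and the four 1x10 gates are sliced out at column offsets 0/10/20/30:
//   g = tanh(z[:, 0:10])        (candidate)
//   i = sigmoid(z[:, 10:20])    (input gate)
//   f = sigmoid(z[:, 20:30])    (forget gate)
//   o = sigmoid(z[:, 30:40])    (output gate)
//   c_new = clamp(f * c + i * g, -10, 10)
//   h_new = o * tanh(c_new)
// sigmoid(x) is materialized as 0.5 * tanh(0.5 * x) + 0.5, the clamp is the
// minf/maxf pair against +/-1.0e+01, and the selects on the %57 mask keep the
// previous c/h for masked steps. The new h is then insert_slice'd into the 5x1x10
// accumulator at the loop index clamped to [0, 4] before branching back to ^bb1.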
// -----// IR Dump After LinalgNamedOpConversion //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
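// NOTE (annotation, not part of the pass dump): LinalgNamedOpConversion leaves this
// IR structurally unchanged from the function printed above it; the visible
// difference is only that the dump is now at module scope, so the inline affine_map
// attributes are hoisted into the #map0..#map9 aliases and the public @main ABI
// wrapper (hal.tensor.import / call @_main / hal.tensor.export) is printed alongside
// the private @_main body.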
// -----// IR Dump After ExpandTensorShapes //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
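// NOTE (annotation, not part of the pass dump): ExpandTensorShapes, as I understand
// it, expands dynamically shaped tensors into (tensor, dynamic dims) pairs across
// globals and call boundaries; every tensor in this module is statically shaped, so
// the dump above is identical to the LinalgNamedOpConversion dump that precedes it.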
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
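// NOTE (annotation, not part of the pass dump): @main is the public ABI wrapper
// only: it imports the two !hal.buffer_view arguments as 1x5xf32 and 1x5x2x2xf32
// tensors, forwards them to the private @_main (dumped next), and exports the
// 5x1x10xf32 result back to a !hal.buffer_view. SimplifyGlobalAccessesPass is dumped
// per function, which is why @main and @_main appear as separate dumps here.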
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
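// Note (editorial, hedged): between the previous dump and this one the loop-invariant operands
// appear to have been hoisted out of the ^bb1 block arguments (the weights, bias and the padded
// inputs are now referenced directly as %cst_8, %3 and %6), shrinking the loop-carried values
// from nine to four; the unused block arguments of ^bb2/^bb3 were dropped, and the expand_shape
// of the zero 40xf32 bias was folded into the tensor<1x40xf32> constant %cst.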
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
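// Note (editorial, hedged): this dump appears identical to the ApplyPatternsPass output above;
// the module contains no util.global ops for FoldGlobals to rewrite.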
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
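// Note (editorial): the @main wrapper is already minimal, so Canonicalizer and the CSE pass
// dumped below leave it unchanged.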
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After SplitReduction //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CaptureDispatchDynamicDims //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
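// Annotation (editor's reading, not compiler output): the @main dumps above are just the
// ABI wrapper -- two hal.tensor.import ops, a call to @_main, and a hal.tensor.export of
// the 5x1x10 result -- so the passes listed leave it unchanged; the substantive
// transformations show up in the @_main dumps that follow.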
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%2 = linalg.fill ins(%cst_1 : f32) outs(%1 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%3 = tensor.insert_slice %0 into %2[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<1x5x4xf32> into tensor<1x5x64xf32>
%4 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<1x5x64xf32>) outs(%4 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%6 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%6 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%8 = tensor.expand_shape %7 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<5xf32>) -> tensor<5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8 : tensor<5x1x1xf32>) outs(%10 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<5xf32>
%12 = linalg.init_tensor [5] : tensor<5xi1>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%15, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%17 = linalg.init_tensor [] : tensor<i32>
%18 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%16 : tensor<5xi32>) outs(%18 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %19 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%23 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%11 : tensor<5xf32>) outs(%9 : tensor<5xf32>) : tensor<5xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%25, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%27 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%26 : tensor<5xi32>) outs(%27 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %28 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_2, %29 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%87 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
cf.br ^bb1(%30, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%32: tensor<i32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %31 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2, ^bb3
^bb2: // pred: ^bb1
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %cst_6 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.addi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%39 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%39 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %8[%87, %88, %89] : tensor<5x1x1xf32>
linalg.yield %90 : f32
} -> tensor<1x1xf32>
%41 = tensor.collapse_shape %40 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<1xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%44 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%44 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<1x10xi1>
%46 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%46 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %5[%87, %88, %89] : tensor<5x1x64xf32>
linalg.yield %90 : f32
} -> tensor<1x64xf32>
%48 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%49 = tensor.insert_slice %47 into %48[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%50 = tensor.insert_slice %34 into %49[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%51 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%52 = linalg.fill ins(%cst_1 : f32) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = linalg.matmul ins(%50, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%52 : tensor<1x40xf32>) -> tensor<1x40xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%51 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x40xf32>
%55 = tensor.extract_slice %54[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %33 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%61 = tensor.extract_slice %54[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%66 = tensor.extract_slice %54[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %67 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60, %68 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.maxf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %33, %71 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%73 = tensor.extract_slice %54[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %34, %79 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%81 = tensor.expand_shape %80 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%82 = tensor.extract %32[] : tensor<i32>
%83 = arith.maxsi %82, %c0_i32 : i32
%84 = arith.minsi %83, %c4_i32 : i32
%85 = arith.index_cast %84 : i32 to index
%86 = tensor.insert_slice %81 into %35[%85, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%38, %72, %80, %86 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %35 : tensor<5x1x10xf32>
}
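// Annotation (editor's reading of the IR above, not compiler output): each trip through
// ^bb2 of @_main computes one LSTM cell step. The 1x74 row is [x_t (64) | h_{t-1} (10)];
// the matmul against the dense<4.200000e-01> 74x40 weights plus the zero bias yields four
// 10-wide gate pre-activations at column offsets 0, 10, 20, and 30 of the 1x40 result.
// The offset-0 slice passes through a plain tanh (the candidate); the offset-10, -20, and
// -30 slices go through a sigmoid written as
//   sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5.
// The offset-20 gate scales the previous cell state (the forget path), the offset-10 gate
// scales the candidate, and their sum is clamped to [-10, 10] via minf/maxf to give the
// new cell state; h_t is the offset-30 gate times tanh of that cell state. A mask formed
// by comparing the broadcast per-step scalar input against zero selects the previous
// (c, h) where it holds and the updated values otherwise, and h_t is written into the
// 5x1x10 accumulator at the timestep index clamped to [0, 4] before branching back to ^bb1.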
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%2 = linalg.fill ins(%cst_1 : f32) outs(%1 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%3 = tensor.insert_slice %0 into %2[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<1x5x4xf32> into tensor<1x5x64xf32>
%4 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<1x5x64xf32>) outs(%4 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%6 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%6 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%8 = tensor.expand_shape %7 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<5xf32>) -> tensor<5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8 : tensor<5x1x1xf32>) outs(%10 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<5xf32>
%12 = linalg.init_tensor [5] : tensor<5xi1>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%15, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%17 = linalg.init_tensor [] : tensor<i32>
%18 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%16 : tensor<5xi32>) outs(%18 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %19 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%23 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%11 : tensor<5xf32>) outs(%9 : tensor<5xf32>) : tensor<5xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%25, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%27 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%26 : tensor<5xi32>) outs(%27 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %28 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_2, %29 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%87 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
cf.br ^bb1(%30, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%32: tensor<i32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %31 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2, ^bb3
^bb2: // pred: ^bb1
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %cst_6 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.addi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%39 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%39 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %8[%87, %88, %89] : tensor<5x1x1xf32>
linalg.yield %90 : f32
} -> tensor<1x1xf32>
%41 = tensor.collapse_shape %40 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<1xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%44 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%44 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<1x10xi1>
%46 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%46 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %5[%87, %88, %89] : tensor<5x1x64xf32>
linalg.yield %90 : f32
} -> tensor<1x64xf32>
%48 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%49 = tensor.insert_slice %47 into %48[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%50 = tensor.insert_slice %34 into %49[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%51 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%52 = linalg.fill ins(%cst_1 : f32) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = linalg.matmul ins(%50, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%52 : tensor<1x40xf32>) -> tensor<1x40xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%51 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x40xf32>
%55 = tensor.extract_slice %54[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %33 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%61 = tensor.extract_slice %54[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%66 = tensor.extract_slice %54[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %67 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60, %68 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.maxf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %33, %71 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%73 = tensor.extract_slice %54[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %34, %79 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%81 = tensor.expand_shape %80 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%82 = tensor.extract %32[] : tensor<i32>
%83 = arith.maxsi %82, %c0_i32 : i32
%84 = arith.minsi %83, %c4_i32 : i32
%85 = arith.index_cast %84 : i32 to index
%86 = tensor.insert_slice %81 into %35[%85, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%38, %72, %80, %86 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %35 : tensor<5x1x10xf32>
}
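// Annotation (editor's reading, not compiler output): this ConvertElementwiseToLinalg dump
// of @_main matches the preceding PadTensorToSubTensorInsert dump verbatim; the
// element-wise work is already expressed as linalg.generic ops, so the pass appears to be
// a no-op here. The LinalgFoldUnitExtentDims dump below is where the unit dimensions
// (e.g. tensor<1x10xf32> -> tensor<10xf32>) start to be collapsed.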
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = linalg.init_tensor [5, 64] : tensor<5x64xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<5x64xf32>) outs(%5 : tensor<5x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x64xf32>
%7 = tensor.expand_shape %6 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%8 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%8 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5xf32>
%11 = tensor.expand_shape %10 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%12 = linalg.init_tensor [5] : tensor<5xf32>
%13 = linalg.fill ins(%cst_4 : f32) outs(%12 : tensor<5xf32>) -> tensor<5xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<5xf32>) outs(%13 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<5xf32>
%15 = linalg.init_tensor [5] : tensor<5xi1>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%17 = linalg.init_tensor [5] : tensor<5xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%16 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%18, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%20 = linalg.init_tensor [] : tensor<i32>
%21 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%19 : tensor<5xi32>) outs(%21 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%23 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %22 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%24 = linalg.init_tensor [] : tensor<i1>
%25 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%23, %cst_10 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%26 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%14 : tensor<5xf32>) outs(%12 : tensor<5xf32>) : tensor<5xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%27 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%29 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%30 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%29 : tensor<5xi32>) outs(%30 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%32 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %31 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%25, %cst_6, %32 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%119 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %23 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
cf.br ^bb1(%33, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%35: tensor<i32>, %36: tensor<1x10xf32>, %37: tensor<1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %34 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2, ^bb3
^bb2: // pred: ^bb1
%41 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %cst_7 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.addi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%42 = linalg.init_tensor [] : tensor<f32>
%43 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%42 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = tensor.extract %11[%119, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %120 : f32
} -> tensor<f32>
%44 = linalg.init_tensor [10] : tensor<10xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43 : tensor<f32>) outs(%44 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%46 = linalg.init_tensor [10] : tensor<10xi1>
%47 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45, %cst : tensor<10xf32>, tensor<10xf32>) outs(%46 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<10xi1>
%48 = linalg.init_tensor [64] : tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35 : tensor<i32>) outs(%48 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = linalg.index 0 : index
%121 = tensor.extract %7[%119, %c0, %120] : tensor<5x1x64xf32>
linalg.yield %121 : f32
} -> tensor<64xf32>
%50 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%51 = tensor.insert_slice %49 into %50[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%52 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%53 = tensor.insert_slice %52 into %51[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%54 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%55 = linalg.fill ins(%cst_5 : f32) outs(%54 : tensor<1x40xf32>) -> tensor<1x40xf32>
%56 = linalg.matmul ins(%53, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%55 : tensor<1x40xf32>) -> tensor<1x40xf32>
%57 = tensor.collapse_shape %56 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%58 = linalg.init_tensor [40] : tensor<40xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%58 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<40xf32>
%60 = tensor.expand_shape %59 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%61 = tensor.extract_slice %60[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63 : tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%66 = linalg.init_tensor [10] : tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%66 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%68 = linalg.init_tensor [10] : tensor<10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%68 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%70 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%71 = linalg.init_tensor [10] : tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %70 : tensor<10xf32>, tensor<10xf32>) outs(%71 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%73 = tensor.extract_slice %60[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75 : tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%78 = linalg.init_tensor [10] : tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%78 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%80 = linalg.init_tensor [10] : tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%80 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%82 = tensor.extract_slice %60[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%82 : tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %84 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %86 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%89 = linalg.init_tensor [10] : tensor<10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%89 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%91 = linalg.init_tensor [10] : tensor<10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%90, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%91 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.maxf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%93 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %93, %92 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%96 = tensor.expand_shape %95 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%97 = tensor.extract_slice %60[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99 : tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%103, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%106 = linalg.init_tensor [10] : tensor<10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%92 : tensor<10xf32>) outs(%106 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%108 = linalg.init_tensor [10] : tensor<10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105, %107 : tensor<10xf32>, tensor<10xf32>) outs(%108 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%110 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%111 = linalg.init_tensor [10] : tensor<10xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %110, %109 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%111 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%113 = tensor.expand_shape %112 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%114 = tensor.extract %35[] : tensor<i32>
%115 = arith.maxsi %114, %c0_i32 : i32
%116 = arith.minsi %115, %c4_i32 : i32
%117 = arith.index_cast %116 : i32 to index
%118 = tensor.insert_slice %112 into %38[%117, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%41, %96, %113, %118 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %38 : tensor<5x1x10xf32>
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = linalg.init_tensor [5, 64] : tensor<5x64xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<5x64xf32>) outs(%5 : tensor<5x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x64xf32>
%7 = tensor.expand_shape %6 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%8 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%8 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5xf32>
%11 = tensor.expand_shape %10 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%12 = linalg.init_tensor [5] : tensor<5xf32>
%13 = linalg.fill ins(%cst_4 : f32) outs(%12 : tensor<5xf32>) -> tensor<5xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<5xf32>) outs(%13 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<5xf32>
%15 = linalg.init_tensor [5] : tensor<5xi1>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%17 = linalg.init_tensor [5] : tensor<5xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%16 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%18, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%20 = linalg.init_tensor [] : tensor<i32>
%21 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%19 : tensor<5xi32>) outs(%21 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%23 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %22 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%24 = linalg.init_tensor [] : tensor<i1>
%25 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%23, %cst_10 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%26 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%14 : tensor<5xf32>) outs(%12 : tensor<5xf32>) : tensor<5xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%27 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%29 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%30 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%29 : tensor<5xi32>) outs(%30 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%32 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %31 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%25, %cst_6, %32 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%119 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %23 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
cf.br ^bb1(%33, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%35: tensor<i32>, %36: tensor<1x10xf32>, %37: tensor<1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %34 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2, ^bb3
^bb2: // pred: ^bb1
%41 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %cst_7 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.addi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%42 = linalg.init_tensor [] : tensor<f32>
%43 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%42 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = tensor.extract %11[%119, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %120 : f32
} -> tensor<f32>
%44 = linalg.init_tensor [10] : tensor<10xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43 : tensor<f32>) outs(%44 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%46 = linalg.init_tensor [10] : tensor<10xi1>
%47 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45, %cst : tensor<10xf32>, tensor<10xf32>) outs(%46 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<10xi1>
%48 = linalg.init_tensor [64] : tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35 : tensor<i32>) outs(%48 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = linalg.index 0 : index
%121 = tensor.extract %7[%119, %c0, %120] : tensor<5x1x64xf32>
linalg.yield %121 : f32
} -> tensor<64xf32>
%50 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%51 = tensor.insert_slice %49 into %50[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%52 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%53 = tensor.insert_slice %52 into %51[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%54 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%55 = linalg.fill ins(%cst_5 : f32) outs(%54 : tensor<1x40xf32>) -> tensor<1x40xf32>
%56 = linalg.matmul ins(%53, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%55 : tensor<1x40xf32>) -> tensor<1x40xf32>
%57 = tensor.collapse_shape %56 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%58 = linalg.init_tensor [40] : tensor<40xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%58 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<40xf32>
%60 = tensor.expand_shape %59 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%61 = tensor.extract_slice %60[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63 : tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%66 = linalg.init_tensor [10] : tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%66 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%68 = linalg.init_tensor [10] : tensor<10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%68 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%70 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%71 = linalg.init_tensor [10] : tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %70 : tensor<10xf32>, tensor<10xf32>) outs(%71 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%73 = tensor.extract_slice %60[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75 : tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%78 = linalg.init_tensor [10] : tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%78 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%80 = linalg.init_tensor [10] : tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%80 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%82 = tensor.extract_slice %60[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%82 : tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %84 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %86 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%89 = linalg.init_tensor [10] : tensor<10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%89 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%91 = linalg.init_tensor [10] : tensor<10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%90, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%91 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.maxf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%93 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %93, %92 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%96 = tensor.expand_shape %95 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%97 = tensor.extract_slice %60[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99 : tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%103, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%106 = linalg.init_tensor [10] : tensor<10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%92 : tensor<10xf32>) outs(%106 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%108 = linalg.init_tensor [10] : tensor<10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105, %107 : tensor<10xf32>, tensor<10xf32>) outs(%108 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%110 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%111 = linalg.init_tensor [10] : tensor<10xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %110, %109 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%111 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%113 = tensor.expand_shape %112 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%114 = tensor.extract %35[] : tensor<i32>
%115 = arith.maxsi %114, %c0_i32 : i32
%116 = arith.minsi %115, %c4_i32 : i32
%117 = arith.index_cast %116 : i32 to index
%118 = tensor.insert_slice %112 into %38[%117, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%41, %96, %113, %118 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %38 : tensor<5x1x10xf32>
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = linalg.init_tensor [5, 64] : tensor<5x64xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<5x64xf32>) outs(%5 : tensor<5x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x64xf32>
%7 = tensor.expand_shape %6 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%8 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%8 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5xf32>
%11 = tensor.expand_shape %10 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%12 = linalg.init_tensor [5] : tensor<5xf32>
%13 = linalg.fill ins(%cst_4 : f32) outs(%12 : tensor<5xf32>) -> tensor<5xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<5xf32>) outs(%13 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<5xf32>
%15 = linalg.init_tensor [5] : tensor<5xi1>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%17 = linalg.init_tensor [5] : tensor<5xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%16 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%18, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%20 = linalg.init_tensor [] : tensor<i32>
%21 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%19 : tensor<5xi32>) outs(%21 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%23 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %22 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%24 = linalg.init_tensor [] : tensor<i1>
%25 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%23, %cst_10 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%26 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%14 : tensor<5xf32>) outs(%12 : tensor<5xf32>) : tensor<5xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%27 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%29 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%30 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%29 : tensor<5xi32>) outs(%30 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%32 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %31 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%25, %cst_6, %32 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%119 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %23 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
cf.br ^bb1(%33, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%35: tensor<i32>, %36: tensor<1x10xf32>, %37: tensor<1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %34 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2, ^bb3
^bb2: // pred: ^bb1
%41 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %cst_7 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.addi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%42 = linalg.init_tensor [] : tensor<f32>
%43 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%42 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = tensor.extract %11[%119, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %120 : f32
} -> tensor<f32>
%44 = linalg.init_tensor [10] : tensor<10xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43 : tensor<f32>) outs(%44 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%46 = linalg.init_tensor [10] : tensor<10xi1>
%47 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45, %cst : tensor<10xf32>, tensor<10xf32>) outs(%46 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<10xi1>
%48 = linalg.init_tensor [64] : tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35 : tensor<i32>) outs(%48 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = linalg.index 0 : index
%121 = tensor.extract %7[%119, %c0, %120] : tensor<5x1x64xf32>
linalg.yield %121 : f32
} -> tensor<64xf32>
%50 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%51 = tensor.insert_slice %49 into %50[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%52 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%53 = tensor.insert_slice %52 into %51[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%54 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%55 = linalg.fill ins(%cst_5 : f32) outs(%54 : tensor<1x40xf32>) -> tensor<1x40xf32>
%56 = linalg.matmul ins(%53, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%55 : tensor<1x40xf32>) -> tensor<1x40xf32>
%57 = tensor.collapse_shape %56 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%58 = linalg.init_tensor [40] : tensor<40xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%58 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<40xf32>
%60 = tensor.expand_shape %59 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%61 = tensor.extract_slice %60[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63 : tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%66 = linalg.init_tensor [10] : tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%66 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%68 = linalg.init_tensor [10] : tensor<10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%68 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%70 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%71 = linalg.init_tensor [10] : tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %70 : tensor<10xf32>, tensor<10xf32>) outs(%71 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%73 = tensor.extract_slice %60[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75 : tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%78 = linalg.init_tensor [10] : tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%78 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%80 = linalg.init_tensor [10] : tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%80 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%82 = tensor.extract_slice %60[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%82 : tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %84 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %86 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%89 = linalg.init_tensor [10] : tensor<10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%89 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%91 = linalg.init_tensor [10] : tensor<10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%90, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%91 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.maxf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%93 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %93, %92 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%96 = tensor.expand_shape %95 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%97 = tensor.extract_slice %60[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99 : tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%103, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%106 = linalg.init_tensor [10] : tensor<10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%92 : tensor<10xf32>) outs(%106 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%108 = linalg.init_tensor [10] : tensor<10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105, %107 : tensor<10xf32>, tensor<10xf32>) outs(%108 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%110 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%111 = linalg.init_tensor [10] : tensor<10xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %110, %109 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%111 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%113 = tensor.expand_shape %112 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%114 = tensor.extract %35[] : tensor<i32>
%115 = arith.maxsi %114, %c0_i32 : i32
%116 = arith.minsi %115, %c4_i32 : i32
%117 = arith.index_cast %116 : i32 to index
%118 = tensor.insert_slice %112 into %38[%117, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%41, %96, %113, %118 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %38 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.fill ins(%cst_4 : f32) outs(%8 : tensor<5xf32>) -> tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = arith.minf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<5xf32>
%11 = linalg.init_tensor [5] : tensor<5xi1>
%12 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%115 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %115 : i1
} -> tensor<5xi1>
%13 = linalg.init_tensor [5] : tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%12 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%115 = arith.extui %arg2 : i1 to i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.muli %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%115 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %18 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.subi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%20 = linalg.init_tensor [] : tensor<i1>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%19, %cst_10 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%115 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %115 : i1
} -> tensor<i1>
%22 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%10 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%115 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %115 : i1
} -> tensor<5xi1>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%115 = arith.extui %arg2 : i1 to i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.muli %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%26 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%25 : tensor<5xi32>) outs(%26 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%115 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %27 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.subi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%21, %cst_6, %28 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%115 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %19 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.subi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
cf.br ^bb1(%29, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%31: tensor<i32>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %30 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%115 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %115 : i1
} -> tensor<i1>
%36 = tensor.extract %35[] : tensor<i1>
cf.cond_br %36, ^bb2, ^bb3
^bb2: // pred: ^bb1
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %cst_7 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.addi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<f32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31 : tensor<i32>) outs(%38 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%115 = arith.index_cast %arg2 : i32 to index
%116 = tensor.extract %7[%115, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %116 : f32
} -> tensor<f32>
%40 = linalg.init_tensor [10] : tensor<10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%39 : tensor<f32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%42 = linalg.init_tensor [10] : tensor<10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%41, %cst : tensor<10xf32>, tensor<10xf32>) outs(%42 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%115 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %115 : i1
} -> tensor<10xi1>
%44 = linalg.init_tensor [64] : tensor<64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%31 : tensor<i32>) outs(%44 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%115 = arith.index_cast %arg2 : i32 to index
%116 = linalg.index 0 : index
%117 = tensor.extract %5[%115, %c0, %116] : tensor<5x1x64xf32>
linalg.yield %117 : f32
} -> tensor<64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%48 = tensor.collapse_shape %33 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%49 = tensor.insert_slice %48 into %47[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%50 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%51 = linalg.fill ins(%cst_5 : f32) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.matmul ins(%49, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%54 = linalg.init_tensor [40] : tensor<40xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%53, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%54 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<40xf32>
%56 = tensor.expand_shape %55 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%57 = tensor.extract_slice %56[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%58 = linalg.init_tensor [10] : tensor<10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%58 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%60 = linalg.init_tensor [10] : tensor<10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%59 : tensor<10xf32>) outs(%60 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%66 = tensor.collapse_shape %32 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%67 = linalg.init_tensor [10] : tensor<10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %66 : tensor<10xf32>, tensor<10xf32>) outs(%67 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%69 = tensor.extract_slice %56[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%70 = linalg.init_tensor [10] : tensor<10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%70 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%72 = linalg.init_tensor [10] : tensor<10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%71 : tensor<10xf32>) outs(%72 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%78 = tensor.extract_slice %56[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%79 = linalg.init_tensor [10] : tensor<10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%78 : tensor<10xf32>) outs(%79 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%81 = linalg.init_tensor [10] : tensor<10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %80 : tensor<10xf32>, tensor<10xf32>) outs(%81 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%68, %82 : tensor<10xf32>, tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%84, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.minf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%86, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.maxf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%89 = tensor.collapse_shape %32 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%90 = linalg.init_tensor [10] : tensor<10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %89, %88 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%90 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%115 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%92 = tensor.expand_shape %91 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%93 = tensor.extract_slice %56[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%93, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%96 = linalg.init_tensor [10] : tensor<10xf32>
%97 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%95 : tensor<10xf32>) outs(%96 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88 : tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %103 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%106 = tensor.collapse_shape %33 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%107 = linalg.init_tensor [10] : tensor<10xf32>
%108 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %106, %105 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%107 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%115 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%109 = tensor.expand_shape %108 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%110 = tensor.extract %31[] : tensor<i32>
%111 = arith.maxsi %110, %c0_i32 : i32
%112 = arith.minsi %111, %c4_i32 : i32
%113 = arith.index_cast %112 : i32 to index
%114 = tensor.insert_slice %108 into %34[%113, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%37, %92, %109, %114 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %34 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
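// CSE of the canonicalized form: the per-op linalg.init_tensor [10] destinations
// in the loop body are deduplicated into a single %40, and the duplicate
// tensor.collapse_shape ops on the two loop-carried 1x10 states now appear once
// each (%62 for %32, %48 for %33); the computation itself is unchanged.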
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.fill ins(%cst_4 : f32) outs(%8 : tensor<5xf32>) -> tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = arith.minf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<5xf32>
%11 = linalg.init_tensor [5] : tensor<5xi1>
%12 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%91 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %91 : i1
} -> tensor<5xi1>
%13 = linalg.init_tensor [5] : tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%12 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%91 = arith.extui %arg2 : i1 to i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.muli %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%91 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %18 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.subi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%20 = linalg.init_tensor [] : tensor<i1>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%19, %cst_10 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%91 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %91 : i1
} -> tensor<i1>
%22 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%10 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%91 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %91 : i1
} -> tensor<5xi1>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%91 = arith.extui %arg2 : i1 to i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.muli %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%26 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%25 : tensor<5xi32>) outs(%26 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%91 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %27 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.subi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%21, %cst_6, %28 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%91 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %19 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.subi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
cf.br ^bb1(%29, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%31: tensor<i32>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %30 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%91 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %91 : i1
} -> tensor<i1>
%36 = tensor.extract %35[] : tensor<i1>
cf.cond_br %36, ^bb2, ^bb3
^bb2: // pred: ^bb1
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %cst_7 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.addi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<f32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31 : tensor<i32>) outs(%38 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%91 = arith.index_cast %arg2 : i32 to index
%92 = tensor.extract %7[%91, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %92 : f32
} -> tensor<f32>
%40 = linalg.init_tensor [10] : tensor<10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%39 : tensor<f32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%42 = linalg.init_tensor [10] : tensor<10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%41, %cst : tensor<10xf32>, tensor<10xf32>) outs(%42 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%91 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %91 : i1
} -> tensor<10xi1>
%44 = linalg.init_tensor [64] : tensor<64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%31 : tensor<i32>) outs(%44 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%91 = arith.index_cast %arg2 : i32 to index
%92 = linalg.index 0 : index
%93 = tensor.extract %5[%91, %c0, %92] : tensor<5x1x64xf32>
linalg.yield %93 : f32
} -> tensor<64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%48 = tensor.collapse_shape %33 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%49 = tensor.insert_slice %48 into %47[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%50 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%51 = linalg.fill ins(%cst_5 : f32) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.matmul ins(%49, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%54 = linalg.init_tensor [40] : tensor<40xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%53, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%54 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<40xf32>
%56 = tensor.expand_shape %55 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%57 = tensor.extract_slice %56[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%58 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%59, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%60, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%62 = tensor.collapse_shape %32 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %62 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%64 = tensor.extract_slice %56[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%64, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%66, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%69 = tensor.extract_slice %56[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%68, %70 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63, %71 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.minf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.maxf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %62, %74 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%91 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%76 = tensor.expand_shape %75 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%77 = tensor.extract_slice %56[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%78 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%80, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%74 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %82 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %48, %83 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%91 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%85 = tensor.expand_shape %84 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%86 = tensor.extract %31[] : tensor<i32>
%87 = arith.maxsi %86, %c0_i32 : i32
%88 = arith.minsi %87, %c4_i32 : i32
%89 = arith.index_cast %88 : i32 to index
%90 = tensor.insert_slice %84 into %34[%89, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%37, %76, %85, %90 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %34 : tensor<5x1x10xf32>
}
// -----// IR Dump After FusionOfTensorOps //----- //
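// After elementwise fusion the loop body is much smaller: the chains of
// per-gate generics collapse into three fused linalg.generic ops on
// tensor<1x10xf32> (%49 builds the clamped candidate state from the gate slices
// at offsets 0/10/20, %51 and %55 apply the step-flag select that carries the
// two recurrent states forward), and the index computation in the prelude is
// fused into single reduction generics (%15, %20) that combine the
// cmpf/extui/muli/maxsi steps.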
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.fill ins(%cst_3 : f32) outs(%8 : tensor<5xf32>) -> tensor<5xf32>
%10 = tensor.expand_shape %9 [[0, 1]] : tensor<5xf32> into tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.minf %arg2, %arg3 : f32
linalg.yield %62 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi eq, %arg2, %c5_i32 : i32
%64 = arith.select %63, %c0_i32, %62 : i32
linalg.yield %64 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi slt, %arg2, %62 : i32
linalg.yield %63 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = tensor.extract %7[%62, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %63 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = linalg.index 0 : index
%64 = tensor.extract %5[%62, %c0, %63] : tensor<5x1x64xf32>
linalg.yield %64 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%40 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.addf %arg2, %cst : f32
linalg.yield %62 : f32
} -> tensor<1x40xf32>
%42 = tensor.extract_slice %41[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %41[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.extract_slice %41[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = tensor.expand_shape %44 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%48 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %23, %46, %47 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%48 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg2, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %arg4, %cst_2 : f32
%68 = math.tanh %67 : f32
%69 = arith.mulf %68, %cst_2 : f32
%70 = arith.addf %69, %cst_2 : f32
%71 = arith.mulf %70, %62 : f32
%72 = arith.mulf %66, %arg3 : f32
%73 = arith.addf %72, %71 : f32
%74 = arith.minf %73, %cst_0 : f32
%75 = arith.maxf %74, %cst_1 : f32
linalg.yield %75 : f32
} -> tensor<1x10xf32>
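    // Note: the mulf-by-0.5 / tanh / mulf-by-0.5 / addf-0.5 sequences above are
    // the identity sigmoid(x) = 0.5 * tanh(x / 2) + 0.5 applied to the gate
    // pre-activations.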
%50 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%50 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%62 = arith.cmpf ogt, %arg2, %cst : f32
%63 = arith.select %62, %arg3, %arg4 : f32
linalg.yield %63 : f32
} -> tensor<1x10xf32>
%52 = tensor.extract_slice %41[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%53 = tensor.expand_shape %52 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %53, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg4, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %66, %62 : f32
%68 = arith.cmpf ogt, %arg2, %cst : f32
%69 = arith.select %68, %arg3, %67 : f32
linalg.yield %69 : f32
} -> tensor<1x10xf32>
%56 = tensor.collapse_shape %55 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%57 = tensor.extract %22[] : tensor<i32>
%58 = arith.maxsi %57, %c0_i32 : i32
%59 = arith.minsi %58, %c4_i32 : i32
%60 = arith.index_cast %59 : i32 to index
%61 = tensor.insert_slice %56 into %25[%60, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %51, %55, %61 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
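// Canonicalization after fusion only cleans up the prelude: the +inf fill that
// seeds the min-reduction over %arg0 is now created directly as a 1x5 tensor
// (%9/%10) instead of filling a 5-element tensor and expanding it; the loop
// body is unchanged.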
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.minf %arg2, %arg3 : f32
linalg.yield %62 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi eq, %arg2, %c5_i32 : i32
%64 = arith.select %63, %c0_i32, %62 : i32
linalg.yield %64 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi slt, %arg2, %62 : i32
linalg.yield %63 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = tensor.extract %7[%62, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %63 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = linalg.index 0 : index
%64 = tensor.extract %5[%62, %c0, %63] : tensor<5x1x64xf32>
linalg.yield %64 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%40 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.addf %arg2, %cst : f32
linalg.yield %62 : f32
} -> tensor<1x40xf32>
%42 = tensor.extract_slice %41[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %41[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.extract_slice %41[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = tensor.expand_shape %44 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%48 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %23, %46, %47 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%48 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg2, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %arg4, %cst_2 : f32
%68 = math.tanh %67 : f32
%69 = arith.mulf %68, %cst_2 : f32
%70 = arith.addf %69, %cst_2 : f32
%71 = arith.mulf %70, %62 : f32
%72 = arith.mulf %66, %arg3 : f32
%73 = arith.addf %72, %71 : f32
%74 = arith.minf %73, %cst_0 : f32
%75 = arith.maxf %74, %cst_1 : f32
linalg.yield %75 : f32
} -> tensor<1x10xf32>
%50 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%50 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%62 = arith.cmpf ogt, %arg2, %cst : f32
%63 = arith.select %62, %arg3, %arg4 : f32
linalg.yield %63 : f32
} -> tensor<1x10xf32>
%52 = tensor.extract_slice %41[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%53 = tensor.expand_shape %52 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %53, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg4, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %66, %62 : f32
%68 = arith.cmpf ogt, %arg2, %cst : f32
%69 = arith.select %68, %arg3, %67 : f32
linalg.yield %69 : f32
} -> tensor<1x10xf32>
%56 = tensor.collapse_shape %55 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%57 = tensor.extract %22[] : tensor<i32>
%58 = arith.maxsi %57, %c0_i32 : i32
%59 = arith.minsi %58, %c4_i32 : i32
%60 = arith.index_cast %59 : i32 to index
%61 = tensor.insert_slice %56 into %25[%60, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %51, %55, %61 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
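// Note (illustrative sketch, not part of the pass output): the fused linalg.generic
// ops above implement one LSTM cell step. Each gate is computed from a 10-wide slice
// of the 1x40 matmul result; the repeated "mulf 0.5, tanh, mulf 0.5, addf 0.5"
// sequence is the tanh form of the logistic sigmoid; the updated cell state is
// clamped to [-10, 10] (arith.minf / arith.maxf); and the two select-based generics
// choose between the previous and newly computed h/c using a per-timestep scalar.
// A minimal standalone form of that sigmoid identity:
func.func @sigmoid_via_tanh(%x: f32) -> f32 {
  // sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5
  %half = arith.constant 5.000000e-01 : f32
  %0 = arith.mulf %x, %half : f32
  %1 = math.tanh %0 : f32
  %2 = arith.mulf %1, %half : f32
  %3 = arith.addf %2, %half : f32
  return %3 : f32
}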
// -----// IR Dump After CSE //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.minf %arg2, %arg3 : f32
linalg.yield %59 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi eq, %arg2, %c5_i32 : i32
%61 = arith.select %60, %c0_i32, %59 : i32
linalg.yield %61 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi slt, %arg2, %59 : i32
linalg.yield %60 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = tensor.extract %7[%59, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %60 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = linalg.index 0 : index
%61 = tensor.extract %5[%59, %c0, %60] : tensor<5x1x64xf32>
linalg.yield %61 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%37 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.addf %arg2, %cst : f32
linalg.yield %59 : f32
} -> tensor<1x40xf32>
%41 = tensor.extract_slice %40[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%42 = tensor.extract_slice %40[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %40[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.expand_shape %41 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %23, %45, %46 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg2, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %arg4, %cst_2 : f32
%65 = math.tanh %64 : f32
%66 = arith.mulf %65, %cst_2 : f32
%67 = arith.addf %66, %cst_2 : f32
%68 = arith.mulf %67, %59 : f32
%69 = arith.mulf %63, %arg3 : f32
%70 = arith.addf %69, %68 : f32
%71 = arith.minf %70, %cst_0 : f32
%72 = arith.maxf %71, %cst_1 : f32
linalg.yield %72 : f32
} -> tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%59 = arith.cmpf ogt, %arg2, %cst : f32
%60 = arith.select %59, %arg3, %arg4 : f32
linalg.yield %60 : f32
} -> tensor<1x10xf32>
%50 = tensor.extract_slice %40[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%51 = tensor.expand_shape %50 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %51, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg4, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %63, %59 : f32
%65 = arith.cmpf ogt, %arg2, %cst : f32
%66 = arith.select %65, %arg3, %64 : f32
linalg.yield %66 : f32
} -> tensor<1x10xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%54 = tensor.extract %22[] : tensor<i32>
%55 = arith.maxsi %54, %c0_i32 : i32
%56 = arith.minsi %55, %c4_i32 : i32
%57 = arith.index_cast %56 : i32 to index
%58 = tensor.insert_slice %53 into %25[%57, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %49, %52, %58 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
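// Note (illustrative sketch, not part of the pass output): relative to the dump
// before it, CSE has deduplicated the structurally identical linalg.init_tensor ops
// in the loop body (the bias-add generic now reuses the [1, 40] init %37, and all
// three element-wise generics share the single [1, 10] init %47), and the local SSA
// numbering has been compacted accordingly. A minimal example of the kind of
// redundancy CSE removes here (after the pass, both results refer to one op):
func.func @duplicate_inits() -> (tensor<1x10xf32>, tensor<1x10xf32>) {
  %0 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
  %1 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
  return %0, %1 : tensor<1x10xf32>, tensor<1x10xf32>
}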
// -----// IR Dump After SplitReduction //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.minf %arg2, %arg3 : f32
linalg.yield %59 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi eq, %arg2, %c5_i32 : i32
%61 = arith.select %60, %c0_i32, %59 : i32
linalg.yield %61 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi slt, %arg2, %59 : i32
linalg.yield %60 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = tensor.extract %7[%59, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %60 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = linalg.index 0 : index
%61 = tensor.extract %5[%59, %c0, %60] : tensor<5x1x64xf32>
linalg.yield %61 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%37 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.addf %arg2, %cst : f32
linalg.yield %59 : f32
} -> tensor<1x40xf32>
%41 = tensor.extract_slice %40[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%42 = tensor.extract_slice %40[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %40[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.expand_shape %41 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %23, %45, %46 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg2, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %arg4, %cst_2 : f32
%65 = math.tanh %64 : f32
%66 = arith.mulf %65, %cst_2 : f32
%67 = arith.addf %66, %cst_2 : f32
%68 = arith.mulf %67, %59 : f32
%69 = arith.mulf %63, %arg3 : f32
%70 = arith.addf %69, %68 : f32
%71 = arith.minf %70, %cst_0 : f32
%72 = arith.maxf %71, %cst_1 : f32
linalg.yield %72 : f32
} -> tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%59 = arith.cmpf ogt, %arg2, %cst : f32
%60 = arith.select %59, %arg3, %arg4 : f32
linalg.yield %60 : f32
} -> tensor<1x10xf32>
%50 = tensor.extract_slice %40[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%51 = tensor.expand_shape %50 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %51, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg4, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %63, %59 : f32
%65 = arith.cmpf ogt, %arg2, %cst : f32
%66 = arith.select %65, %arg3, %64 : f32
linalg.yield %66 : f32
} -> tensor<1x10xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%54 = tensor.extract %22[] : tensor<i32>
%55 = arith.maxsi %54, %c0_i32 : i32
%56 = arith.minsi %55, %c4_i32 : i32
%57 = arith.index_cast %56 : i32 to index
%58 = tensor.insert_slice %53 into %25[%57, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %49, %52, %58 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
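// Note (illustrative, not part of the pass output): this SplitReduction dump is
// unchanged from the CSE output. The pass rewrites a large reduction dimension
// K = P * T as P partial reductions of length T followed by a final combine:
//   sum_{k=0}^{K-1} a_k = sum_{p=0}^{P-1} ( sum_{t=0}^{T-1} a_{p*T + t} )
// The reductions in this module (the 5-element generics and the 1x74x40 matmul)
// appear to be below the split threshold, so nothing is rewritten.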
// -----// IR Dump After InterchangeGenericOps //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.minf %arg2, %arg3 : f32
linalg.yield %59 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi eq, %arg2, %c5_i32 : i32
%61 = arith.select %60, %c0_i32, %59 : i32
linalg.yield %61 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi slt, %arg2, %59 : i32
linalg.yield %60 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = tensor.extract %7[%59, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %60 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = linalg.index 0 : index
%61 = tensor.extract %5[%59, %c0, %60] : tensor<5x1x64xf32>
linalg.yield %61 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%37 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.addf %arg2, %cst : f32
linalg.yield %59 : f32
} -> tensor<1x40xf32>
%41 = tensor.extract_slice %40[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%42 = tensor.extract_slice %40[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %40[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.expand_shape %41 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %23, %45, %46 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg2, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %arg4, %cst_2 : f32
%65 = math.tanh %64 : f32
%66 = arith.mulf %65, %cst_2 : f32
%67 = arith.addf %66, %cst_2 : f32
%68 = arith.mulf %67, %59 : f32
%69 = arith.mulf %63, %arg3 : f32
%70 = arith.addf %69, %68 : f32
%71 = arith.minf %70, %cst_0 : f32
%72 = arith.maxf %71, %cst_1 : f32
linalg.yield %72 : f32
} -> tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%59 = arith.cmpf ogt, %arg2, %cst : f32
%60 = arith.select %59, %arg3, %arg4 : f32
linalg.yield %60 : f32
} -> tensor<1x10xf32>
%50 = tensor.extract_slice %40[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%51 = tensor.expand_shape %50 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %51, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg4, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %63, %59 : f32
%65 = arith.cmpf ogt, %arg2, %cst : f32
%66 = arith.select %65, %arg3, %64 : f32
linalg.yield %66 : f32
} -> tensor<1x10xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%54 = tensor.extract %22[] : tensor<i32>
%55 = arith.maxsi %54, %c0_i32 : i32
%56 = arith.minsi %55, %c4_i32 : i32
%57 = arith.index_cast %56 : i32 to index
%58 = tensor.insert_slice %53 into %25[%57, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %49, %52, %58 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
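// Note (illustrative sketch, not part of the pass output): this InterchangeGenericOps
// dump is also unchanged. The pass permutes the loop order of linalg.generic ops so
// that parallel iterators come before reduction iterators; the generics in this
// module are already either fully parallel or a single 1-D reduction, so there is
// nothing to interchange. A hypothetical op the pass would rewrite:
func.func @row_sum(%A: tensor<8x16xf32>, %acc: tensor<16xf32>) -> tensor<16xf32> {
  // The reduction iterator is listed first here; interchange would swap d0/d1 so
  // that iterator_types = ["parallel", "reduction"], with the indexing maps
  // permuted to (d0, d1) -> (d1, d0) and (d0, d1) -> (d0).
  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%A : tensor<8x16xf32>) outs(%acc : tensor<16xf32>) {
  ^bb0(%a: f32, %sum: f32):
    %1 = arith.addf %a, %sum : f32
    linalg.yield %1 : f32
  } -> tensor<16xf32>
  return %0 : tensor<16xf32>
}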
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%c0 = arith.constant 0 : index
%c30 = arith.constant 30 : index
%c64 = arith.constant 64 : index
%c40 = arith.constant 40 : index
%c10 = arith.constant 10 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = flow.tensor.splat %cst : tensor<1x5x64xf32>
%1 = flow.tensor.reshape %arg1 : tensor<1x5x2x2xf32> -> tensor<5x4xf32>
%2 = flow.dispatch.workgroups[%c4, %c5, %c1](%1, %0) : (tensor<5x4xf32>, tensor<1x5x64xf32>) -> %0 =
(%arg2: !flow.dispatch.tensor<readonly:5x4xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5x64xf32>) {
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [5, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x4xf32> -> tensor<5x4xf32>
flow.dispatch.tensor.store %35, %arg3, offsets = [%c0_3, %c0_3, %c0_3], sizes = [1, 5, 4], strides = [%c1_4, %c1_4, %c1_4] : tensor<5x4xf32> -> !flow.dispatch.tensor<readwrite:1x5x64xf32>
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<1x5x64xf32> -> tensor<5x1x64xf32>
%4 = flow.tensor.reshape %arg0 : tensor<1x5xf32> -> tensor<5x1x1xf32>
%5 = flow.tensor.splat %cst_0 : tensor<1x5xf32>
%6 = flow.dispatch.workgroups[%c5, %c1, %c1](%arg0, %5) : (tensor<1x5xf32>, tensor<1x5xf32>) -> %5 =
(%arg2: !flow.dispatch.tensor<readonly:1x5xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x5xf32> -> tensor<1x5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x5xf32> -> tensor<1x5xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<1x5xf32>) outs(%36 : tensor<1x5xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%38 = arith.minf %arg4, %arg5 : f32
linalg.yield %38 : f32
} -> tensor<1x5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : tensor<1x5xf32> -> !flow.dispatch.tensor<readwrite:1x5xf32>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<1x5xf32> -> tensor<5xf32>
%8 = flow.dispatch.workgroups[%c1, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%36 : tensor<i32>) -> tensor<i32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%37 : tensor<i32>) {
^bb0(%arg4: f32, %arg5: i32, %arg6: i32):
%40 = arith.cmpf oeq, %arg4, %cst_3 : f32
%41 = arith.extui %40 : i1 to i32
%42 = arith.muli %41, %arg5 : i32
%43 = arith.maxsi %42, %arg6 : i32
linalg.yield %43 : i32
} -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%38 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%40 = arith.subi %c5_i32, %arg4 : i32
linalg.yield %40 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %39, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%9 = flow.dispatch.workgroups[%c5, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<5xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [5] : tensor<5xf32>
%37 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%35 : tensor<5xf32>) outs(%36 : tensor<5xf32>) : tensor<5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0], sizes = [5], strides = [1] : tensor<5xf32> -> !flow.dispatch.tensor<writeonly:5xf32>
flow.return
}
%10 = flow.dispatch.workgroups[%c1, %c1, %c1](%9, %8) : (tensor<5xf32>, tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i32>
%38 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%37 : tensor<i32>) -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%38 : tensor<i32>) {
^bb0(%arg5: f32, %arg6: i32, %arg7: i32):
%41 = arith.cmpf oeq, %arg5, %cst_3 : f32
%42 = arith.extui %41 : i1 to i32
%43 = arith.muli %42, %arg6 : i32
%44 = arith.maxsi %43, %arg7 : i32
linalg.yield %44 : i32
} -> tensor<i32>
%40 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36, %39 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i32>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i32):
%41 = arith.subi %c5_i32, %arg6 : i32
%42 = arith.cmpi eq, %arg5, %c5_i32 : i32
%43 = arith.select %42, %c0_i32, %41 : i32
linalg.yield %43 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %40, %arg4, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
cf.br ^bb1(%10, %cst_2, %cst_2, %cst_1 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%11: tensor<i32>, %12: tensor<1x10xf32>, %13: tensor<1x10xf32>, %14: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%15 = flow.dispatch.workgroups[%c1, %c1, %c1](%11, %8) : (tensor<i32>, tensor<i32>) -> tensor<i1> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i1>) {
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i1>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %36 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i1>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i1):
%39 = arith.subi %c5_i32, %arg6 : i32
%40 = arith.cmpi slt, %arg5, %39 : i32
linalg.yield %40 : i1
} -> tensor<i1>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:i1>
flow.return
}
%16 = flow.tensor.load %15 : tensor<i1>
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = flow.dispatch.workgroups[%c1, %c1, %c1](%11) : (tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c1_i32 = arith.constant 1 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%38 = arith.addi %arg4, %c1_i32 : i32
linalg.yield %38 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %37, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%18 = flow.dispatch.workgroups[%c1, %c1, %c1](%4, %11) : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<f32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x1xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:f32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x1xf32> -> tensor<5x1x1xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<f32>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36 : tensor<i32>) outs(%37 : tensor<f32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = tensor.extract %35[%39, %c0_3, %c0_3] : tensor<5x1x1xf32>
linalg.yield %40 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
%19 = flow.dispatch.workgroups[%c64, %c1, %c1](%3, %11) : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<64xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x64xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:64xf32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x64xf32> -> tensor<5x1x64xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [64] : tensor<64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%36 : tensor<i32>) outs(%37 : tensor<64xf32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = linalg.index 0 : index
%41 = tensor.extract %35[%39, %c0_3, %40] : tensor<5x1x64xf32>
linalg.yield %41 : f32
} -> tensor<64xf32>
flow.dispatch.tensor.store %38, %arg4, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
flow.return
}
%20 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%21 = flow.tensor.reshape %19 : tensor<64xf32> -> tensor<1x64xf32>
%22 = flow.tensor.update %21, %20[%c0, %c0] : tensor<1x64xf32> -> %20 as tensor<1x74xf32>
%23 = flow.tensor.update %13, %22[%c0, %c64] : tensor<1x10xf32> -> %22 as tensor<1x74xf32>
%24 = flow.dispatch.workgroups[%c40, %c1, %c1](%23) : (tensor<1x74xf32>) -> tensor<1x40xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x74xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x40xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 74], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x74xf32> -> tensor<1x74xf32>
%36 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%37 = linalg.fill ins(%cst_3 : f32) outs(%36 : tensor<1x40xf32>) -> tensor<1x40xf32>
%38 = linalg.matmul ins(%35, %cst_4 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%38 : tensor<1x40xf32>) outs(%36 : tensor<1x40xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%40 = arith.addf %arg4, %cst_3 : f32
linalg.yield %40 : f32
} -> tensor<1x40xf32>
flow.dispatch.tensor.store %39, %arg3, offsets = [0, 0], sizes = [1, 40], strides = [1, 1] : tensor<1x40xf32> -> !flow.dispatch.tensor<writeonly:1x40xf32>
flow.return
}
%25 = flow.tensor.slice %24[%c0, %c20 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%26 = flow.tensor.slice %24[%c0, %c10 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%27 = flow.tensor.slice %24[%c0, %c0 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%28 = flow.dispatch.workgroups[%c10, %c1, %c1](%25, %12, %26, %27) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x10xf32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 1.000000e+01 : f32
%cst_5 = arith.constant -1.000000e+01 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg7, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %arg9, %cst_3 : f32
%47 = math.tanh %46 : f32
%48 = arith.mulf %47, %cst_3 : f32
%49 = arith.addf %48, %cst_3 : f32
%50 = arith.mulf %49, %41 : f32
%51 = arith.mulf %45, %arg8 : f32
%52 = arith.addf %51, %50 : f32
%53 = arith.minf %52, %cst_4 : f32
%54 = arith.maxf %53, %cst_5 : f32
linalg.yield %54 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
%29 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %12, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%38 : tensor<1x10xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32, %arg9: f32):
%40 = arith.cmpf ogt, %arg6, %cst_3 : f32
%41 = arith.select %40, %arg7, %arg8 : f32
linalg.yield %41 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %39, %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
%30 = flow.tensor.slice %24[%c0, %c30 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%31 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %13, %30, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg9, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %45, %41 : f32
%47 = arith.cmpf ogt, %arg7, %cst_4 : f32
%48 = arith.select %47, %arg8, %46 : f32
linalg.yield %48 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
%32 = flow.tensor.reshape %31 : tensor<1x10xf32> -> tensor<10xf32>
%33 = flow.tensor.load %11 : tensor<i32>
%34 = flow.dispatch.workgroups[%c10, %c1, %c1](%32, %14, %33) : (tensor<10xf32>, tensor<5x1x10xf32>, i32) -> %14 =
(%arg2: !flow.dispatch.tensor<readonly:10xf32>, %arg3: !flow.dispatch.tensor<readwrite:5x1x10xf32>, %arg4: i32) {
%c1_3 = arith.constant 1 : index
%c0_4 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c4_i32 = arith.constant 4 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:10xf32> -> tensor<10xf32>
%36 = arith.maxsi %arg4, %c0_i32 : i32
%37 = arith.minsi %36, %c4_i32 : i32
%38 = arith.index_cast %37 : i32 to index
flow.dispatch.tensor.store %35, %arg3, offsets = [%38, %c0_4, %c0_4], sizes = [1, 1, 10], strides = [%c1_3, %c1_3, %c1_3] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:5x1x10xf32>
flow.return
}
cf.br ^bb1(%17, %29, %31, %34 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %14 : tensor<5x1x10xf32>
}
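// Note (illustrative sketch, not part of the pass output): DispatchLinalgOnTensors
// has grouped each linalg op (together with its fused producers) into a
// flow.dispatch.workgroups region keyed by a 3-D workload, rewritten the pure data
// movement at the top level into flow.tensor.splat/reshape/slice/update, read the
// loop condition back on the host with flow.tensor.load to drive cf.cond_br, and
// passed the dynamic insertion index into its dispatch as a primitive i32 operand.
// The general shape of such a region, reduced to a pass-through copy for clarity:
func.func @dispatch_region_shape(%in: tensor<1x10xf32>) -> tensor<1x10xf32> {
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %0 = flow.dispatch.workgroups[%c10, %c1, %c1](%in) : (tensor<1x10xf32>) -> tensor<1x10xf32> =
      (%arg1: !flow.dispatch.tensor<readonly:1x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x10xf32>) {
    %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
    flow.dispatch.tensor.store %1, %arg2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
    flow.return
  }
  return %0 : tensor<1x10xf32>
}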
// -----// IR Dump After CaptureDispatchDynamicDims //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%c0 = arith.constant 0 : index
%c30 = arith.constant 30 : index
%c64 = arith.constant 64 : index
%c40 = arith.constant 40 : index
%c10 = arith.constant 10 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = flow.tensor.splat %cst : tensor<1x5x64xf32>
%1 = flow.tensor.reshape %arg1 : tensor<1x5x2x2xf32> -> tensor<5x4xf32>
%2 = flow.dispatch.workgroups[%c4, %c5, %c1](%1, %0) : (tensor<5x4xf32>, tensor<1x5x64xf32>) -> %0 =
(%arg2: !flow.dispatch.tensor<readonly:5x4xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5x64xf32>) {
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [5, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x4xf32> -> tensor<5x4xf32>
flow.dispatch.tensor.store %35, %arg3, offsets = [%c0_3, %c0_3, %c0_3], sizes = [1, 5, 4], strides = [%c1_4, %c1_4, %c1_4] : tensor<5x4xf32> -> !flow.dispatch.tensor<readwrite:1x5x64xf32>
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<1x5x64xf32> -> tensor<5x1x64xf32>
%4 = flow.tensor.reshape %arg0 : tensor<1x5xf32> -> tensor<5x1x1xf32>
%5 = flow.tensor.splat %cst_0 : tensor<1x5xf32>
%6 = flow.dispatch.workgroups[%c5, %c1, %c1](%arg0, %5) : (tensor<1x5xf32>, tensor<1x5xf32>) -> %5 =
(%arg2: !flow.dispatch.tensor<readonly:1x5xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x5xf32> -> tensor<1x5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x5xf32> -> tensor<1x5xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<1x5xf32>) outs(%36 : tensor<1x5xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%38 = arith.minf %arg4, %arg5 : f32
linalg.yield %38 : f32
} -> tensor<1x5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : tensor<1x5xf32> -> !flow.dispatch.tensor<readwrite:1x5xf32>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<1x5xf32> -> tensor<5xf32>
%8 = flow.dispatch.workgroups[%c1, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%36 : tensor<i32>) -> tensor<i32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%37 : tensor<i32>) {
^bb0(%arg4: f32, %arg5: i32, %arg6: i32):
%40 = arith.cmpf oeq, %arg4, %cst_3 : f32
%41 = arith.extui %40 : i1 to i32
%42 = arith.muli %41, %arg5 : i32
%43 = arith.maxsi %42, %arg6 : i32
linalg.yield %43 : i32
} -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%38 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%40 = arith.subi %c5_i32, %arg4 : i32
linalg.yield %40 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %39, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%9 = flow.dispatch.workgroups[%c5, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<5xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [5] : tensor<5xf32>
%37 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%35 : tensor<5xf32>) outs(%36 : tensor<5xf32>) : tensor<5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0], sizes = [5], strides = [1] : tensor<5xf32> -> !flow.dispatch.tensor<writeonly:5xf32>
flow.return
}
%10 = flow.dispatch.workgroups[%c1, %c1, %c1](%9, %8) : (tensor<5xf32>, tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i32>
%38 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%37 : tensor<i32>) -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%38 : tensor<i32>) {
^bb0(%arg5: f32, %arg6: i32, %arg7: i32):
%41 = arith.cmpf oeq, %arg5, %cst_3 : f32
%42 = arith.extui %41 : i1 to i32
%43 = arith.muli %42, %arg6 : i32
%44 = arith.maxsi %43, %arg7 : i32
linalg.yield %44 : i32
} -> tensor<i32>
%40 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36, %39 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i32>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i32):
%41 = arith.subi %c5_i32, %arg6 : i32
%42 = arith.cmpi eq, %arg5, %c5_i32 : i32
%43 = arith.select %42, %c0_i32, %41 : i32
linalg.yield %43 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %40, %arg4, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
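  // Annotation: %10 repeats the zero-position reduction of %8 on the reversed vector,
  // yielding 5 - (last 1-based zero position in %9), overridden to 0 when %8 == 5.
  // This value seeds the loop counter carried into ^bb1.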
cf.br ^bb1(%10, %cst_2, %cst_2, %cst_1 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%11: tensor<i32>, %12: tensor<1x10xf32>, %13: tensor<1x10xf32>, %14: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
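  // Annotation: loop-carried values are %11 = iteration counter, %12 and %13 =
  // apparently the LSTM cell state and hidden state, %14 = 5x1x10 output accumulator.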
%15 = flow.dispatch.workgroups[%c1, %c1, %c1](%11, %8) : (tensor<i32>, tensor<i32>) -> tensor<i1> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i1>) {
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i1>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %36 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i1>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i1):
%39 = arith.subi %c5_i32, %arg6 : i32
%40 = arith.cmpi slt, %arg5, %39 : i32
linalg.yield %40 : i1
} -> tensor<i1>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:i1>
flow.return
}
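  // Annotation: %15 is the loop condition, true while %11 < 5 - %8, i.e. while
  // timesteps remain to be processed.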
%16 = flow.tensor.load %15 : tensor<i1>
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = flow.dispatch.workgroups[%c1, %c1, %c1](%11) : (tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c1_i32 = arith.constant 1 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%38 = arith.addi %arg4, %c1_i32 : i32
linalg.yield %38 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %37, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
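  // Annotation: %17 increments the loop counter %11 by 1.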
%18 = flow.dispatch.workgroups[%c1, %c1, %c1](%4, %11) : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<f32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x1xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:f32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x1xf32> -> tensor<5x1x1xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<f32>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36 : tensor<i32>) outs(%37 : tensor<f32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = tensor.extract %35[%39, %c0_3, %c0_3] : tensor<5x1x1xf32>
linalg.yield %40 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
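  // Annotation: %18 gathers the scalar %4[%11, 0, 0] for the current timestep;
  // it is used below as a mask: where it is > 0 the previous states are carried
  // through unchanged, which looks like padding handling.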
%19 = flow.dispatch.workgroups[%c64, %c1, %c1](%3, %11) : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<64xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x64xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:64xf32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x64xf32> -> tensor<5x1x64xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [64] : tensor<64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%36 : tensor<i32>) outs(%37 : tensor<64xf32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = linalg.index 0 : index
%41 = tensor.extract %35[%39, %c0_3, %40] : tensor<5x1x64xf32>
linalg.yield %41 : f32
} -> tensor<64xf32>
flow.dispatch.tensor.store %38, %arg4, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
flow.return
}
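  // Annotation: %19 gathers the 64-element input slice %3[%11, 0, 0:64] for the
  // current timestep.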
%20 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%21 = flow.tensor.reshape %19 : tensor<64xf32> -> tensor<1x64xf32>
%22 = flow.tensor.update %21, %20[%c0, %c0] : tensor<1x64xf32> -> %20 as tensor<1x74xf32>
%23 = flow.tensor.update %13, %22[%c0, %c64] : tensor<1x10xf32> -> %22 as tensor<1x74xf32>
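  // Annotation: %20..%23 concatenate the current input row (%21, 1x64) with the
  // previous hidden state (%13, 1x10) into a single 1x74 row.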
%24 = flow.dispatch.workgroups[%c40, %c1, %c1](%23) : (tensor<1x74xf32>) -> tensor<1x40xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x74xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x40xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 74], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x74xf32> -> tensor<1x74xf32>
%36 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%37 = linalg.fill ins(%cst_3 : f32) outs(%36 : tensor<1x40xf32>) -> tensor<1x40xf32>
%38 = linalg.matmul ins(%35, %cst_4 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%38 : tensor<1x40xf32>) outs(%36 : tensor<1x40xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%40 = arith.addf %arg4, %cst_3 : f32
linalg.yield %40 : f32
} -> tensor<1x40xf32>
flow.dispatch.tensor.store %39, %arg3, offsets = [0, 0], sizes = [1, 40], strides = [1, 1] : tensor<1x40xf32> -> !flow.dispatch.tensor<writeonly:1x40xf32>
flow.return
}
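  // Annotation: %24 multiplies the 1x74 row by the constant 74x40 weight matrix
  // (all elements 4.2e-01) and adds a zero bias, producing what looks like the four
  // concatenated 10-wide gate pre-activations.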
%25 = flow.tensor.slice %24[%c0, %c20 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%26 = flow.tensor.slice %24[%c0, %c10 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%27 = flow.tensor.slice %24[%c0, %c0 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
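  // Annotation: gate pre-activation slices of %24: %25 = columns 20..29,
  // %26 = columns 10..19, %27 = columns 0..9 (columns 30..39 are sliced later as %30).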
%28 = flow.dispatch.workgroups[%c10, %c1, %c1](%25, %12, %26, %27) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x10xf32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 1.000000e+01 : f32
%cst_5 = arith.constant -1.000000e+01 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg7, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %arg9, %cst_3 : f32
%47 = math.tanh %46 : f32
%48 = arith.mulf %47, %cst_3 : f32
%49 = arith.addf %48, %cst_3 : f32
%50 = arith.mulf %49, %41 : f32
%51 = arith.mulf %45, %arg8 : f32
%52 = arith.addf %51, %50 : f32
%53 = arith.minf %52, %cst_4 : f32
%54 = arith.maxf %53, %cst_5 : f32
linalg.yield %54 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
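  // Annotation: %28 appears to compute the new cell state. The repeated
  //   0.5 * tanh(0.5 * x) + 0.5
  // pattern is the usual sigmoid-via-tanh identity, so the body is effectively
  //   c_new = clamp(sigmoid(%25) * %12 + sigmoid(%26) * tanh(%27), -10, 10)
  // with %12 as the previous cell state.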
%29 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %12, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%38 : tensor<1x10xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32, %arg9: f32):
%40 = arith.cmpf ogt, %arg6, %cst_3 : f32
%41 = arith.select %40, %arg7, %arg8 : f32
linalg.yield %41 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %39, %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
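  // Annotation: %29 selects per element: where the mask scalar %18 > 0 keep the
  // previous cell state %12, otherwise take the freshly computed %28.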
%30 = flow.tensor.slice %24[%c0, %c30 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%31 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %13, %30, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg9, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %45, %41 : f32
%47 = arith.cmpf ogt, %arg7, %cst_4 : f32
%48 = arith.select %47, %arg8, %46 : f32
linalg.yield %48 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
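  // Annotation: %31 is the new hidden state, sigmoid(%30) * tanh(c_new) via the same
  // tanh identity, with the same %18 > 0 select falling back to the previous hidden
  // state %13.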
%32 = flow.tensor.reshape %31 : tensor<1x10xf32> -> tensor<10xf32>
%33 = flow.tensor.load %11 : tensor<i32>
%34 = flow.dispatch.workgroups[%c10, %c1, %c1](%32, %14, %33) : (tensor<10xf32>, tensor<5x1x10xf32>, i32) -> %14 =
(%arg2: !flow.dispatch.tensor<readonly:10xf32>, %arg3: !flow.dispatch.tensor<readwrite:5x1x10xf32>, %arg4: i32) {
%c1_3 = arith.constant 1 : index
%c0_4 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c4_i32 = arith.constant 4 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:10xf32> -> tensor<10xf32>
%36 = arith.maxsi %arg4, %c0_i32 : i32
%37 = arith.minsi %36, %c4_i32 : i32
%38 = arith.index_cast %37 : i32 to index
flow.dispatch.tensor.store %35, %arg3, offsets = [%38, %c0_4, %c0_4], sizes = [1, 1, 10], strides = [%c1_3, %c1_3, %c1_3] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:5x1x10xf32>
flow.return
}
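  // Annotation: %34 performs a dynamic update slice, writing the 10-element hidden
  // state into row clamp(%11, 0, 4) of the 5x1x10 output accumulator %14.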
cf.br ^bb1(%17, %29, %31, %34 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %14 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%c0 = arith.constant 0 : index
%c30 = arith.constant 30 : index
%c64 = arith.constant 64 : index
%c40 = arith.constant 40 : index
%c10 = arith.constant 10 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = flow.tensor.splat %cst : tensor<1x5x64xf32>
%1 = flow.tensor.reshape %arg1 : tensor<1x5x2x2xf32> -> tensor<5x4xf32>
%2 = flow.dispatch.workgroups[%c4, %c5, %c1](%1, %0) : (tensor<5x4xf32>, tensor<1x5x64xf32>) -> %0 =
(%arg2: !flow.dispatch.tensor<readonly:5x4xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5x64xf32>) {
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [5, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x4xf32> -> tensor<5x4xf32>
flow.dispatch.tensor.store %35, %arg3, offsets = [%c0_3, %c0_3, %c0_3], sizes = [1, 5, 4], strides = [%c1_4, %c1_4, %c1_4] : tensor<5x4xf32> -> !flow.dispatch.tensor<readwrite:1x5x64xf32>
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<1x5x64xf32> -> tensor<5x1x64xf32>
%4 = flow.tensor.reshape %arg0 : tensor<1x5xf32> -> tensor<5x1x1xf32>
%5 = flow.tensor.splat %cst_0 : tensor<1x5xf32>
%6 = flow.dispatch.workgroups[%c5, %c1, %c1](%arg0, %5) : (tensor<1x5xf32>, tensor<1x5xf32>) -> %5 =
(%arg2: !flow.dispatch.tensor<readonly:1x5xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x5xf32> -> tensor<1x5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x5xf32> -> tensor<1x5xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<1x5xf32>) outs(%36 : tensor<1x5xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%38 = arith.minf %arg4, %arg5 : f32
linalg.yield %38 : f32
} -> tensor<1x5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : tensor<1x5xf32> -> !flow.dispatch.tensor<readwrite:1x5xf32>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<1x5xf32> -> tensor<5xf32>
%8 = flow.dispatch.workgroups[%c1, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%36 : tensor<i32>) -> tensor<i32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%37 : tensor<i32>) {
^bb0(%arg4: f32, %arg5: i32, %arg6: i32):
%40 = arith.cmpf oeq, %arg4, %cst_3 : f32
%41 = arith.extui %40 : i1 to i32
%42 = arith.muli %41, %arg5 : i32
%43 = arith.maxsi %42, %arg6 : i32
linalg.yield %43 : i32
} -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%38 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%40 = arith.subi %c5_i32, %arg4 : i32
linalg.yield %40 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %39, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%9 = flow.dispatch.workgroups[%c5, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<5xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [5] : tensor<5xf32>
%37 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%35 : tensor<5xf32>) outs(%36 : tensor<5xf32>) : tensor<5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0], sizes = [5], strides = [1] : tensor<5xf32> -> !flow.dispatch.tensor<writeonly:5xf32>
flow.return
}
%10 = flow.dispatch.workgroups[%c1, %c1, %c1](%9, %8) : (tensor<5xf32>, tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i32>
%38 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%37 : tensor<i32>) -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%38 : tensor<i32>) {
^bb0(%arg5: f32, %arg6: i32, %arg7: i32):
%41 = arith.cmpf oeq, %arg5, %cst_3 : f32
%42 = arith.extui %41 : i1 to i32
%43 = arith.muli %42, %arg6 : i32
%44 = arith.maxsi %43, %arg7 : i32
linalg.yield %44 : i32
} -> tensor<i32>
%40 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36, %39 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i32>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i32):
%41 = arith.subi %c5_i32, %arg6 : i32
%42 = arith.cmpi eq, %arg5, %c5_i32 : i32
%43 = arith.select %42, %c0_i32, %41 : i32
linalg.yield %43 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %40, %arg4, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
cf.br ^bb1(%10, %cst_2, %cst_2, %cst_1 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%11: tensor<i32>, %12: tensor<1x10xf32>, %13: tensor<1x10xf32>, %14: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%15 = flow.dispatch.workgroups[%c1, %c1, %c1](%11, %8) : (tensor<i32>, tensor<i32>) -> tensor<i1> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i1>) {
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i1>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %36 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i1>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i1):
%39 = arith.subi %c5_i32, %arg6 : i32
%40 = arith.cmpi slt, %arg5, %39 : i32
linalg.yield %40 : i1
} -> tensor<i1>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:i1>
flow.return
}
%16 = flow.tensor.load %15 : tensor<i1>
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = flow.dispatch.workgroups[%c1, %c1, %c1](%11) : (tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c1_i32 = arith.constant 1 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%38 = arith.addi %arg4, %c1_i32 : i32
linalg.yield %38 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %37, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%18 = flow.dispatch.workgroups[%c1, %c1, %c1](%4, %11) : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<f32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x1xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:f32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x1xf32> -> tensor<5x1x1xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<f32>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36 : tensor<i32>) outs(%37 : tensor<f32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = tensor.extract %35[%39, %c0_3, %c0_3] : tensor<5x1x1xf32>
linalg.yield %40 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
%19 = flow.dispatch.workgroups[%c64, %c1, %c1](%3, %11) : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<64xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x64xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:64xf32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x64xf32> -> tensor<5x1x64xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [64] : tensor<64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%36 : tensor<i32>) outs(%37 : tensor<64xf32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = linalg.index 0 : index
%41 = tensor.extract %35[%39, %c0_3, %40] : tensor<5x1x64xf32>
linalg.yield %41 : f32
} -> tensor<64xf32>
flow.dispatch.tensor.store %38, %arg4, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
flow.return
}
%20 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%21 = flow.tensor.reshape %19 : tensor<64xf32> -> tensor<1x64xf32>
%22 = flow.tensor.update %21, %20[%c0, %c0] : tensor<1x64xf32> -> %20 as tensor<1x74xf32>
%23 = flow.tensor.update %13, %22[%c0, %c64] : tensor<1x10xf32> -> %22 as tensor<1x74xf32>
%24 = flow.dispatch.workgroups[%c40, %c1, %c1](%23) : (tensor<1x74xf32>) -> tensor<1x40xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x74xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x40xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 74], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x74xf32> -> tensor<1x74xf32>
%36 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%37 = linalg.fill ins(%cst_3 :