@benvanik
Created June 15, 2022 19:51
unidirectional_lstm.mlir
This file has been truncated.
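// Overview of the dump below: the IR of a unidirectional LSTM as it moves
// through the early MHLO-level passes of the IREE pipeline. Each
// "IR Dump After <Pass>" marker shows the state of a function (or, for the
// module-level pass, the whole module) after TopLevelSCFToCFG,
// MHLOToMHLOPreprocessing, Canonicalizer, ShapeToShapeLowering, and
// ConvertShapeToStandard. The same three functions recur:
//   * @ForwardLoopCond_gFAnjWGSoLs__.167 - the loop condition; returns
//     %arg0 < %arg1 as a tensor<i1>.
//   * @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189 - the
//     time-step loop, lowered to cf.br / cf.cond_br blocks; ^bb2 performs one
//     LSTM step per iteration over the 5-step sequence dimension.
//   * @main - reshapes and pads the 1x5x2x2 input to 5x1x64, turns the 1x5
//     input into a 5x1x1 per-step value that the loop compares against zero
//     to gate state updates (effectively a per-step mask), calls @Forward,
//     and returns the second 5x1x10 per-step state history.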
// -----// IR Dump After TopLevelSCFToCFG //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<1.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%8 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%9 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%115 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%115) : (tensor<f32>) -> ()
}
%10 = "mhlo.compare"(%9, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%11 = mhlo.convert(%10) : (tensor<5xi1>) -> tensor<5xi32>
%12 = mhlo.multiply %11, %cst_0 : tensor<5xi32>
%13 = mhlo.reduce(%12 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%115 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%115) : (tensor<i32>) -> ()
}
%14 = mhlo.subtract %cst_2, %13 : tensor<i32>
%15 = "mhlo.compare"(%14, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%16 = "mhlo.reverse"(%9) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%17 = "mhlo.compare"(%16, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%18 = mhlo.convert(%17) : (tensor<5xi1>) -> tensor<5xi32>
%19 = mhlo.multiply %18, %cst_0 : tensor<5xi32>
%20 = mhlo.reduce(%19 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%115 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%115) : (tensor<i32>) -> ()
}
%21 = mhlo.subtract %cst_2, %20 : tensor<i32>
%22 = "mhlo.select"(%15, %cst_6, %21) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%23 = mhlo.convert(%22) : (tensor<i32>) -> tensor<i64>
%24 = mhlo.subtract %cst_2, %14 : tensor<i32>
%25 = mhlo.convert(%24) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%23, %25, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%26: tensor<i64>, %27: tensor<i64>, %28: tensor<40xf32>, %29: tensor<i64>, %30: tensor<74x40xf32>, %31: tensor<i64>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x64xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5xi64>, %38: tensor<5x1x10xf32>, %39: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%40 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%41 = tensor.extract %40[] : tensor<i1>
cf.cond_br %41, ^bb2(%26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%26, %31, %32, %33, %37, %38, %39 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%42: tensor<i64>, %43: tensor<i64>, %44: tensor<40xf32>, %45: tensor<i64>, %46: tensor<74x40xf32>, %47: tensor<i64>, %48: tensor<1x10xf32>, %49: tensor<1x10xf32>, %50: tensor<5x1x64xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5x1x1xf32>, %53: tensor<5xi64>, %54: tensor<5x1x10xf32>, %55: tensor<5x1x10xf32>): // pred: ^bb1
%56 = mhlo.add %42, %cst_5 : tensor<i64>
%57 = "mhlo.gather"(%51, %42) {dimension_numbers = #mhlo.gather<offset_dims = [0, 1], collapsed_slice_dims = [0], start_index_map = [0]>, slice_sizes = dense<1> : tensor<3xi64>} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%58 = "mhlo.reshape"(%57) : (tensor<1x1xf32>) -> tensor<1xf32>
%59 = "mhlo.broadcast_in_dim"(%58) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %6 : tensor<1x10xf32>
%61 = "mhlo.compare"(%60, %7) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%62 = "mhlo.gather"(%50, %42) {dimension_numbers = #mhlo.gather<offset_dims = [0, 1], collapsed_slice_dims = [0], start_index_map = [0]>, slice_sizes = dense<[1, 1, 64]> : tensor<3xi64>} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%63 = "mhlo.concatenate"(%62, %49) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%64 = "mhlo.dot"(%63, %46) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%65 = "mhlo.reshape"(%44) : (tensor<40xf32>) -> tensor<1x40xf32>
%66 = mhlo.add %64, %65 : tensor<1x40xf32>
%67 = "mhlo.slice"(%66) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%68 = mhlo.multiply %67, %8 : tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %8 : tensor<1x10xf32>
%71 = mhlo.add %70, %8 : tensor<1x10xf32>
%72 = mhlo.multiply %71, %48 : tensor<1x10xf32>
%73 = "mhlo.slice"(%66) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%74 = mhlo.multiply %73, %8 : tensor<1x10xf32>
%75 = mhlo.tanh %74 : tensor<1x10xf32>
%76 = mhlo.multiply %75, %8 : tensor<1x10xf32>
%77 = mhlo.add %76, %8 : tensor<1x10xf32>
%78 = "mhlo.slice"(%66) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%79 = mhlo.tanh %78 : tensor<1x10xf32>
%80 = mhlo.multiply %77, %79 : tensor<1x10xf32>
%81 = mhlo.add %72, %80 : tensor<1x10xf32>
%82 = mhlo.minimum %81, %4 : tensor<1x10xf32>
%83 = mhlo.maximum %82, %5 : tensor<1x10xf32>
%84 = "mhlo.select"(%61, %48, %83) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%85 = "mhlo.reshape"(%57) : (tensor<1x1xf32>) -> tensor<1xf32>
%86 = "mhlo.broadcast_in_dim"(%85) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %6 : tensor<1x10xf32>
%88 = "mhlo.compare"(%87, %7) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%89 = "mhlo.slice"(%66) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%90 = mhlo.multiply %89, %8 : tensor<1x10xf32>
%91 = mhlo.tanh %90 : tensor<1x10xf32>
%92 = mhlo.multiply %91, %8 : tensor<1x10xf32>
%93 = mhlo.add %92, %8 : tensor<1x10xf32>
%94 = mhlo.tanh %83 : tensor<1x10xf32>
%95 = mhlo.multiply %93, %94 : tensor<1x10xf32>
%96 = "mhlo.select"(%88, %49, %95) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%97 = "mhlo.reshape"(%47) : (tensor<i64>) -> tensor<1xi64>
%98 = "mhlo.reshape"(%42) : (tensor<i64>) -> tensor<1xi64>
%99 = mhlo.convert(%98) : (tensor<1xi64>) -> tensor<1xi32>
%100 = "mhlo.reshape"(%99) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %97, %100) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%102 = "mhlo.reshape"(%84) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%99) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%105 = "mhlo.reshape"(%96) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%106 = "mhlo.reshape"(%99) : (tensor<1xi32>) -> tensor<i32>
%107 = "mhlo.dynamic-update-slice"(%55, %105, %106, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%56, %43, %44, %45, %46, %47, %84, %96, %50, %51, %52, %101, %104, %107 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%108: tensor<i64>, %109: tensor<i64>, %110: tensor<1x10xf32>, %111: tensor<1x10xf32>, %112: tensor<5xi64>, %113: tensor<5x1x10xf32>, %114: tensor<5x1x10xf32>): // pred: ^bb1
return %108, %112, %113, %114, %109, %110, %111 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
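// Note on the body of ^bb2 above: the selected 1x64 input step is concatenated
// with the previous 1x10 hidden state and multiplied by the 74x40 weight
// operand, and the resulting 1x40 pre-activation is sliced into four 1x10
// gates. Each "multiply by 0.5, tanh, multiply by 0.5, add 0.5" chain is the
// identity sigmoid(x) = 0.5 * tanh(x / 2) + 0.5, so the slices at [10,20),
// [20,30), and [30,40) act as sigmoid gates while the [0,10) slice goes
// through a plain tanh; the new cell state is clipped to [-10, 10], and the
// broadcast 5x1x1 value gates each step by selecting between the previous and
// the updated states.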
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After ConvertShapeToStandard //----- //
module {
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
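// NOTE: the loop body (^bb2) of @Forward_o16DF3vQKaI__... above is a single LSTM cell step,
// readable directly from the slices of the 1x40 pre-activation: columns [0,10) go through a
// plain tanh (the candidate), while [10,20), [20,30), and [30,40) each go through
// 0.5 * tanh(0.5 * x) + 0.5, i.e. sigmoid(x), and act functionally as the input, forget, and
// output gates (gate names inferred from how each slice is used, not recorded in the dump).
// The new cell state f*c_prev + i*g is clamped to [-10, 10], the new hidden state is
// o * tanh(c), and a per-step flag broadcast from the 5x1x1 operand selects between keeping
// the previous state and taking the newly computed one.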
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
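// NOTE: IR appears to be dumped after every pass invocation regardless of whether the pass
// changed anything (consistent with MLIR's --mlir-print-ir-after-all; the exact flag used is
// not recorded in this file), so several of the Canonicalizer dumps that follow repeat a
// function with no visible difference from the previous dump of that function. The first
// structural changes show up in the later dumps annotated below.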
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
%0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
return %0 : tensor<i1>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%112 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%112) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%112 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%112) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_4, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %arg4, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<i64>, %29: tensor<74x40xf32>, %30: tensor<i64>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5xi64>, %37: tensor<5x1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = call @ForwardLoopCond_gFAnjWGSoLs__.167(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38) : (tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>) -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %30, %31, %32, %36, %37, %38 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%41: tensor<i64>, %42: tensor<i64>, %43: tensor<40xf32>, %44: tensor<i64>, %45: tensor<74x40xf32>, %46: tensor<i64>, %47: tensor<1x10xf32>, %48: tensor<1x10xf32>, %49: tensor<5x1x64xf32>, %50: tensor<5x1x1xf32>, %51: tensor<5x1x1xf32>, %52: tensor<5xi64>, %53: tensor<5x1x10xf32>, %54: tensor<5x1x10xf32>): // pred: ^bb1
%55 = mhlo.add %41, %cst_5 : tensor<i64>
%56 = "mhlo.torch_index_select"(%50, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%57 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%58 = "mhlo.broadcast_in_dim"(%57) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%59 = "mhlo.compare"(%58, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%60 = "mhlo.torch_index_select"(%49, %41) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%61 = "mhlo.concatenate"(%60, %48) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%62 = "mhlo.dot"(%61, %45) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%63 = "mhlo.reshape"(%43) : (tensor<40xf32>) -> tensor<1x40xf32>
%64 = mhlo.add %62, %63 : tensor<1x40xf32>
%65 = "mhlo.slice"(%64) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %7 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.add %68, %7 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %47 : tensor<1x10xf32>
%71 = "mhlo.slice"(%64) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%72 = mhlo.multiply %71, %7 : tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %73, %7 : tensor<1x10xf32>
%75 = mhlo.add %74, %7 : tensor<1x10xf32>
%76 = "mhlo.slice"(%64) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %75, %77 : tensor<1x10xf32>
%79 = mhlo.add %70, %78 : tensor<1x10xf32>
%80 = mhlo.minimum %79, %4 : tensor<1x10xf32>
%81 = mhlo.maximum %80, %5 : tensor<1x10xf32>
%82 = "mhlo.select"(%59, %47, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%56) : (tensor<1x1xf32>) -> tensor<1xf32>
%84 = "mhlo.broadcast_in_dim"(%83) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%85 = "mhlo.compare"(%84, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%86 = "mhlo.slice"(%64) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%87 = mhlo.multiply %86, %7 : tensor<1x10xf32>
%88 = mhlo.tanh %87 : tensor<1x10xf32>
%89 = mhlo.multiply %88, %7 : tensor<1x10xf32>
%90 = mhlo.add %89, %7 : tensor<1x10xf32>
%91 = mhlo.tanh %81 : tensor<1x10xf32>
%92 = mhlo.multiply %90, %91 : tensor<1x10xf32>
%93 = "mhlo.select"(%85, %48, %92) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%94 = "mhlo.reshape"(%46) : (tensor<i64>) -> tensor<1xi64>
%95 = "mhlo.reshape"(%41) : (tensor<i64>) -> tensor<1xi64>
%96 = mhlo.convert(%95) : (tensor<1xi64>) -> tensor<1xi32>
%97 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%98 = "mhlo.dynamic-update-slice"(%52, %94, %97) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%99 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%100 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%101 = "mhlo.dynamic-update-slice"(%53, %99, %100, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%102 = "mhlo.reshape"(%93) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%103 = "mhlo.reshape"(%96) : (tensor<1xi32>) -> tensor<i32>
%104 = "mhlo.dynamic-update-slice"(%54, %102, %103, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%55, %42, %43, %44, %45, %46, %82, %93, %49, %50, %51, %98, %101, %104 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<i64>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%105: tensor<i64>, %106: tensor<i64>, %107: tensor<1x10xf32>, %108: tensor<1x10xf32>, %109: tensor<5xi64>, %110: tensor<5x1x10xf32>, %111: tensor<5x1x10xf32>): // pred: ^bb1
return %105, %109, %110, %111, %106, %107, %108 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
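// NOTE: compared with the previous dump of @Forward_o16DF3vQKaI__..., this Canonicalizer run
// has folded the loop-condition call away: ^bb1 now computes the i64 compare inline instead
// of calling @ForwardLoopCond_gFAnjWGSoLs__.167, and two dead loop-carried values (a
// constant-zero i64 that was only threaded through unchanged, and the unused second 5x1x1
// operand) are no longer block arguments.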
func.func private @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%arg0: tensor<1x10xf32>, %arg1: tensor<1x10xf32>, %arg2: tensor<5x1x64xf32>, %arg3: tensor<5x1x1xf32>, %arg4: tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>) {
%cst = arith.constant dense<0x7F800000> : tensor<f32>
%0 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_1 = arith.constant dense<-2147483648> : tensor<i32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_3 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_4 = arith.constant dense<0> : tensor<i64>
%2 = mhlo.constant dense<0> : tensor<5xi64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i64>
%4 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%5 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%7 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0> : tensor<i32>
%8 = mhlo.reduce(%arg3 init: %cst) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg5: tensor<f32>, %arg6: tensor<f32>) {
%108 = mhlo.minimum %arg5, %arg6 : tensor<f32>
"mhlo.return"(%108) : (tensor<f32>) -> ()
}
%9 = "mhlo.compare"(%8, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%10 = mhlo.convert(%9) : (tensor<5xi1>) -> tensor<5xi32>
%11 = mhlo.multiply %10, %cst_0 : tensor<5xi32>
%12 = mhlo.reduce(%11 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%108 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%108) : (tensor<i32>) -> ()
}
%13 = mhlo.subtract %cst_2, %12 : tensor<i32>
%14 = "mhlo.compare"(%13, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%15 = "mhlo.reverse"(%8) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%16 = "mhlo.compare"(%15, %0) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%17 = mhlo.convert(%16) : (tensor<5xi1>) -> tensor<5xi32>
%18 = mhlo.multiply %17, %cst_0 : tensor<5xi32>
%19 = mhlo.reduce(%18 init: %cst_1) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg5: tensor<i32>, %arg6: tensor<i32>) {
%108 = mhlo.maximum %arg5, %arg6 : tensor<i32>
"mhlo.return"(%108) : (tensor<i32>) -> ()
}
%20 = mhlo.subtract %cst_2, %19 : tensor<i32>
%21 = "mhlo.select"(%14, %cst_6, %20) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = mhlo.convert(%21) : (tensor<i32>) -> tensor<i64>
%23 = mhlo.subtract %cst_2, %13 : tensor<i32>
%24 = mhlo.convert(%23) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%22, %24, %1, %cst_3, %cst_4, %arg0, %arg1, %arg2, %arg3, %2, %3, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb1(%25: tensor<i64>, %26: tensor<i64>, %27: tensor<40xf32>, %28: tensor<74x40xf32>, %29: tensor<i64>, %30: tensor<1x10xf32>, %31: tensor<1x10xf32>, %32: tensor<5x1x64xf32>, %33: tensor<5x1x1xf32>, %34: tensor<5xi64>, %35: tensor<5x1x10xf32>, %36: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%37 = "mhlo.compare"(%25, %26) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
%38 = tensor.extract %37[] : tensor<i1>
cf.cond_br %38, ^bb2(%25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>), ^bb3(%25, %29, %30, %31, %34, %35, %36 : tensor<i64>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb2(%39: tensor<i64>, %40: tensor<i64>, %41: tensor<40xf32>, %42: tensor<74x40xf32>, %43: tensor<i64>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5xi64>, %49: tensor<5x1x10xf32>, %50: tensor<5x1x10xf32>): // pred: ^bb1
%51 = mhlo.add %39, %cst_5 : tensor<i64>
%52 = "mhlo.torch_index_select"(%47, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%53 = "mhlo.reshape"(%52) : (tensor<1x1xf32>) -> tensor<1xf32>
%54 = "mhlo.broadcast_in_dim"(%53) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%55 = "mhlo.compare"(%54, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%56 = "mhlo.torch_index_select"(%46, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%57 = "mhlo.concatenate"(%56, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%58 = "mhlo.dot"(%57, %42) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%59 = "mhlo.reshape"(%41) : (tensor<40xf32>) -> tensor<1x40xf32>
%60 = mhlo.add %58, %59 : tensor<1x40xf32>
%61 = "mhlo.slice"(%60) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%62 = mhlo.multiply %61, %7 : tensor<1x10xf32>
%63 = mhlo.tanh %62 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %7 : tensor<1x10xf32>
%65 = mhlo.add %64, %7 : tensor<1x10xf32>
%66 = mhlo.multiply %65, %44 : tensor<1x10xf32>
%67 = "mhlo.slice"(%60) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%68 = mhlo.multiply %67, %7 : tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %69, %7 : tensor<1x10xf32>
%71 = mhlo.add %70, %7 : tensor<1x10xf32>
%72 = "mhlo.slice"(%60) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%73 = mhlo.tanh %72 : tensor<1x10xf32>
%74 = mhlo.multiply %71, %73 : tensor<1x10xf32>
%75 = mhlo.add %66, %74 : tensor<1x10xf32>
%76 = mhlo.minimum %75, %4 : tensor<1x10xf32>
%77 = mhlo.maximum %76, %5 : tensor<1x10xf32>
%78 = "mhlo.select"(%55, %44, %77) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%79 = "mhlo.reshape"(%52) : (tensor<1x1xf32>) -> tensor<1xf32>
%80 = "mhlo.broadcast_in_dim"(%79) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%81 = "mhlo.compare"(%80, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%82 = "mhlo.slice"(%60) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%83 = mhlo.multiply %82, %7 : tensor<1x10xf32>
%84 = mhlo.tanh %83 : tensor<1x10xf32>
%85 = mhlo.multiply %84, %7 : tensor<1x10xf32>
%86 = mhlo.add %85, %7 : tensor<1x10xf32>
%87 = mhlo.tanh %77 : tensor<1x10xf32>
%88 = mhlo.multiply %86, %87 : tensor<1x10xf32>
%89 = "mhlo.select"(%81, %45, %88) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%90 = "mhlo.reshape"(%43) : (tensor<i64>) -> tensor<1xi64>
%91 = "mhlo.reshape"(%39) : (tensor<i64>) -> tensor<1xi64>
%92 = mhlo.convert(%91) : (tensor<1xi64>) -> tensor<1xi32>
%93 = "mhlo.reshape"(%92) : (tensor<1xi32>) -> tensor<i32>
%94 = "mhlo.dynamic-update-slice"(%48, %90, %93) : (tensor<5xi64>, tensor<1xi64>, tensor<i32>) -> tensor<5xi64>
%95 = "mhlo.reshape"(%78) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%96 = "mhlo.reshape"(%92) : (tensor<1xi32>) -> tensor<i32>
%97 = "mhlo.dynamic-update-slice"(%49, %95, %96, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
%98 = "mhlo.reshape"(%89) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%99 = "mhlo.reshape"(%92) : (tensor<1xi32>) -> tensor<i32>
%100 = "mhlo.dynamic-update-slice"(%50, %98, %99, %cst_6, %cst_6) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%51, %40, %41, %42, %43, %78, %89, %46, %47, %94, %97, %100 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>)
^bb3(%101: tensor<i64>, %102: tensor<i64>, %103: tensor<1x10xf32>, %104: tensor<1x10xf32>, %105: tensor<5xi64>, %106: tensor<5x1x10xf32>, %107: tensor<5x1x10xf32>): // pred: ^bb1
return %101, %105, %106, %107, %102, %103, %104 : tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%0 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst = arith.constant dense<0.000000e+00> : tensor<f32>
%1 = mhlo.constant dense<0.000000e+00> : tensor<5x1x1xf32>
%2 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%3 = "mhlo.pad"(%2, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%4 = "mhlo.transpose"(%3) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%5 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%6 = "mhlo.reshape"(%5) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%7:7 = call @Forward_o16DF3vQKaI__disable_call_shape_inference_true_.189(%0, %0, %4, %6, %1) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x1xf32>) -> (tensor<i64>, tensor<5xi64>, tensor<5x1x10xf32>, tensor<5x1x10xf32>, tensor<i64>, tensor<1x10xf32>, tensor<1x10xf32>)
return %7#3 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
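// NOTE: this dump of @main already contains the loop from @Forward_o16DF3vQKaI__... inline
// (no call): the CFG loop now lives directly in @main, carries only 9 values, and keeps a
// single mhlo.dynamic-update-slice that writes the 5x1x10 result @main returns; the 5xi64
// index vector and the second 5x1x10 accumulator seen in earlier dumps are gone.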
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert(%25) : (tensor<i32>) -> tensor<i64>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert(%27) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i64>, %30: tensor<i64>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i64>, %41: tensor<i64>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i64>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i64>) -> tensor<1xi64>
%89 = mhlo.convert(%88) : (tensor<1xi64>) -> tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
// -----// IR Dump After Inliner //----- //
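// NOTE: after inlining, the module contains only @main; the private
// @ForwardLoopCond_gFAnjWGSoLs__.167 and @Forward_o16DF3vQKaI__... callees are gone.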
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i64>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert(%25) : (tensor<i32>) -> tensor<i64>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert(%27) : (tensor<i32>) -> tensor<i64>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i64>, %30: tensor<i64>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i64>, %41: tensor<i64>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i64>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i64>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i64>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i64>) -> tensor<1xi64>
%89 = mhlo.convert(%88) : (tensor<1xi64>) -> tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i64>, tensor<i64>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteI64ToI32Pass //----- //
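// NOTE: DemoteI64ToI32 rewrites the remaining i64 tensors to i32: the loop bounds and
// counter, the dense<1> step constant, and the mhlo.torch_index_select indices are now
// tensor<i32>, and the convert ops that previously widened i32 to i64 become same-type
// i32 converts.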
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert %25 : tensor<i32>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert %27 : tensor<i32>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i32>, %30: tensor<i32>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i32>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i32>) -> tensor<1xi32>
%89 = mhlo.convert %88 : tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- //
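// [annotation] All floating-point tensors in this module are already f32 and the body of @main matches the preceding dump, so the demotion pass appears to be a no-op on this input.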
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%94 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%94) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%94 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%94) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.convert %25 : tensor<i32>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
%28 = mhlo.convert %27 : tensor<i32>
cf.br ^bb1(%26, %28, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%29: tensor<i32>, %30: tensor<i32>, %31: tensor<40xf32>, %32: tensor<74x40xf32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x64xf32>, %36: tensor<5x1x1xf32>, %37: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%38 = "mhlo.compare"(%29, %30) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%39 = tensor.extract %38[] : tensor<i1>
cf.cond_br %39, ^bb2(%29, %30, %31, %32, %33, %34, %35, %36, %37 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%37 : tensor<5x1x10xf32>)
^bb2(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // pred: ^bb1
%49 = mhlo.add %40, %cst_0 : tensor<i32>
%50 = "mhlo.torch_index_select"(%47, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%51 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%52 = "mhlo.broadcast_in_dim"(%51) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%53 = "mhlo.compare"(%52, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%54 = "mhlo.torch_index_select"(%46, %40) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%55 = "mhlo.concatenate"(%54, %45) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%56 = "mhlo.dot"(%55, %43) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%57 = "mhlo.reshape"(%42) : (tensor<40xf32>) -> tensor<1x40xf32>
%58 = mhlo.add %56, %57 : tensor<1x40xf32>
%59 = "mhlo.slice"(%58) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.tanh %60 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %0 : tensor<1x10xf32>
%63 = mhlo.add %62, %0 : tensor<1x10xf32>
%64 = mhlo.multiply %63, %44 : tensor<1x10xf32>
%65 = "mhlo.slice"(%58) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.tanh %66 : tensor<1x10xf32>
%68 = mhlo.multiply %67, %0 : tensor<1x10xf32>
%69 = mhlo.add %68, %0 : tensor<1x10xf32>
%70 = "mhlo.slice"(%58) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%71 = mhlo.tanh %70 : tensor<1x10xf32>
%72 = mhlo.multiply %69, %71 : tensor<1x10xf32>
%73 = mhlo.add %64, %72 : tensor<1x10xf32>
%74 = mhlo.minimum %73, %2 : tensor<1x10xf32>
%75 = mhlo.maximum %74, %1 : tensor<1x10xf32>
%76 = "mhlo.select"(%53, %44, %75) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%77 = "mhlo.reshape"(%50) : (tensor<1x1xf32>) -> tensor<1xf32>
%78 = "mhlo.broadcast_in_dim"(%77) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%79 = "mhlo.compare"(%78, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%80 = "mhlo.slice"(%58) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.tanh %81 : tensor<1x10xf32>
%83 = mhlo.multiply %82, %0 : tensor<1x10xf32>
%84 = mhlo.add %83, %0 : tensor<1x10xf32>
%85 = mhlo.tanh %75 : tensor<1x10xf32>
%86 = mhlo.multiply %84, %85 : tensor<1x10xf32>
%87 = "mhlo.select"(%79, %45, %86) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%88 = "mhlo.reshape"(%40) : (tensor<i32>) -> tensor<1xi32>
%89 = mhlo.convert %88 : tensor<1xi32>
%90 = "mhlo.reshape"(%87) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%91 = "mhlo.reshape"(%89) : (tensor<1xi32>) -> tensor<i32>
%92 = "mhlo.dynamic-update-slice"(%48, %90, %91, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%49, %41, %42, %43, %76, %87, %46, %47, %92 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%93: tensor<5x1x10xf32>): // pred: ^bb1
return %93 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
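// [annotation] Relative to the previous dump, canonicalization drops the identity mhlo.convert ops on the i32 loop bounds and folds the reshape/convert/reshape chain around the loop counter, so the mhlo.dynamic-update-slice below takes the counter (%38) directly as its update index.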
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%89 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%89) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%89 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%89) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%89 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%89) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.subtract %cst_2, %17 : tensor<i32>
cf.br ^bb1(%25, %26, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%27: tensor<i32>, %28: tensor<i32>, %29: tensor<40xf32>, %30: tensor<74x40xf32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = "mhlo.compare"(%27, %28) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2(%27, %28, %29, %30, %31, %32, %33, %34, %35 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%35 : tensor<5x1x10xf32>)
^bb2(%38: tensor<i32>, %39: tensor<i32>, %40: tensor<40xf32>, %41: tensor<74x40xf32>, %42: tensor<1x10xf32>, %43: tensor<1x10xf32>, %44: tensor<5x1x64xf32>, %45: tensor<5x1x1xf32>, %46: tensor<5x1x10xf32>): // pred: ^bb1
%47 = mhlo.add %38, %cst_0 : tensor<i32>
%48 = "mhlo.torch_index_select"(%45, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%49 = "mhlo.reshape"(%48) : (tensor<1x1xf32>) -> tensor<1xf32>
%50 = "mhlo.broadcast_in_dim"(%49) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%51 = "mhlo.compare"(%50, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%52 = "mhlo.torch_index_select"(%44, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%53 = "mhlo.concatenate"(%52, %43) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%54 = "mhlo.dot"(%53, %41) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%55 = "mhlo.reshape"(%40) : (tensor<40xf32>) -> tensor<1x40xf32>
%56 = mhlo.add %54, %55 : tensor<1x40xf32>
%57 = "mhlo.slice"(%56) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%58 = mhlo.multiply %57, %0 : tensor<1x10xf32>
%59 = mhlo.tanh %58 : tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.add %60, %0 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %42 : tensor<1x10xf32>
%63 = "mhlo.slice"(%56) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%64 = mhlo.multiply %63, %0 : tensor<1x10xf32>
%65 = mhlo.tanh %64 : tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.add %66, %0 : tensor<1x10xf32>
%68 = "mhlo.slice"(%56) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %67, %69 : tensor<1x10xf32>
%71 = mhlo.add %62, %70 : tensor<1x10xf32>
%72 = mhlo.minimum %71, %2 : tensor<1x10xf32>
%73 = mhlo.maximum %72, %1 : tensor<1x10xf32>
%74 = "mhlo.select"(%51, %42, %73) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%75 = "mhlo.reshape"(%48) : (tensor<1x1xf32>) -> tensor<1xf32>
%76 = "mhlo.broadcast_in_dim"(%75) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%77 = "mhlo.compare"(%76, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%78 = "mhlo.slice"(%56) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%79 = mhlo.multiply %78, %0 : tensor<1x10xf32>
%80 = mhlo.tanh %79 : tensor<1x10xf32>
%81 = mhlo.multiply %80, %0 : tensor<1x10xf32>
%82 = mhlo.add %81, %0 : tensor<1x10xf32>
%83 = mhlo.tanh %73 : tensor<1x10xf32>
%84 = mhlo.multiply %82, %83 : tensor<1x10xf32>
%85 = "mhlo.select"(%77, %43, %84) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%86 = "mhlo.reshape"(%85) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%87 = "mhlo.dynamic-update-slice"(%46, %86, %38, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%47, %39, %40, %41, %74, %85, %44, %45, %87 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%88: tensor<5x1x10xf32>): // pred: ^bb1
return %88 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
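// [annotation] CSE removes the duplicated reshape/broadcast_in_dim/compare chain on the per-step gating scalar; both mhlo.select ops in the loop body now reuse the single predicate %51.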
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%86 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%86) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%86 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%86) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = "mhlo.reverse"(%12) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
%20 = "mhlo.compare"(%19, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%21 = mhlo.convert(%20) : (tensor<5xi1>) -> tensor<5xi32>
%22 = mhlo.multiply %21, %cst_4 : tensor<5xi32>
%23 = mhlo.reduce(%22 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%86 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%86) : (tensor<i32>) -> ()
}
%24 = mhlo.subtract %cst_2, %23 : tensor<i32>
%25 = "mhlo.select"(%18, %cst, %24) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%26 = mhlo.subtract %cst_2, %17 : tensor<i32>
cf.br ^bb1(%25, %26, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%27: tensor<i32>, %28: tensor<i32>, %29: tensor<40xf32>, %30: tensor<74x40xf32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x64xf32>, %34: tensor<5x1x1xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = "mhlo.compare"(%27, %28) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2(%27, %28, %29, %30, %31, %32, %33, %34, %35 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%35 : tensor<5x1x10xf32>)
^bb2(%38: tensor<i32>, %39: tensor<i32>, %40: tensor<40xf32>, %41: tensor<74x40xf32>, %42: tensor<1x10xf32>, %43: tensor<1x10xf32>, %44: tensor<5x1x64xf32>, %45: tensor<5x1x1xf32>, %46: tensor<5x1x10xf32>): // pred: ^bb1
%47 = mhlo.add %38, %cst_0 : tensor<i32>
%48 = "mhlo.torch_index_select"(%45, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%49 = "mhlo.reshape"(%48) : (tensor<1x1xf32>) -> tensor<1xf32>
%50 = "mhlo.broadcast_in_dim"(%49) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%51 = "mhlo.compare"(%50, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%52 = "mhlo.torch_index_select"(%44, %38) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%53 = "mhlo.concatenate"(%52, %43) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%54 = "mhlo.dot"(%53, %41) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%55 = "mhlo.reshape"(%40) : (tensor<40xf32>) -> tensor<1x40xf32>
%56 = mhlo.add %54, %55 : tensor<1x40xf32>
%57 = "mhlo.slice"(%56) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%58 = mhlo.multiply %57, %0 : tensor<1x10xf32>
%59 = mhlo.tanh %58 : tensor<1x10xf32>
%60 = mhlo.multiply %59, %0 : tensor<1x10xf32>
%61 = mhlo.add %60, %0 : tensor<1x10xf32>
%62 = mhlo.multiply %61, %42 : tensor<1x10xf32>
%63 = "mhlo.slice"(%56) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%64 = mhlo.multiply %63, %0 : tensor<1x10xf32>
%65 = mhlo.tanh %64 : tensor<1x10xf32>
%66 = mhlo.multiply %65, %0 : tensor<1x10xf32>
%67 = mhlo.add %66, %0 : tensor<1x10xf32>
%68 = "mhlo.slice"(%56) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%69 = mhlo.tanh %68 : tensor<1x10xf32>
%70 = mhlo.multiply %67, %69 : tensor<1x10xf32>
%71 = mhlo.add %62, %70 : tensor<1x10xf32>
%72 = mhlo.minimum %71, %2 : tensor<1x10xf32>
%73 = mhlo.maximum %72, %1 : tensor<1x10xf32>
%74 = "mhlo.select"(%51, %42, %73) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%75 = "mhlo.slice"(%56) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%76 = mhlo.multiply %75, %0 : tensor<1x10xf32>
%77 = mhlo.tanh %76 : tensor<1x10xf32>
%78 = mhlo.multiply %77, %0 : tensor<1x10xf32>
%79 = mhlo.add %78, %0 : tensor<1x10xf32>
%80 = mhlo.tanh %73 : tensor<1x10xf32>
%81 = mhlo.multiply %79, %80 : tensor<1x10xf32>
%82 = "mhlo.select"(%51, %43, %81) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%83 = "mhlo.reshape"(%82) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%84 = "mhlo.dynamic-update-slice"(%46, %83, %38, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%47, %39, %40, %41, %74, %82, %44, %45, %84 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%85: tensor<5x1x10xf32>): // pred: ^bb1
return %85 : tensor<5x1x10xf32>
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
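// [annotation] In this module the pass only rewrites mhlo.reverse into iree_linalg_ext.reverse with an explicit linalg.init_tensor destination; the remaining mhlo ops are left for the next conversion.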
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%0 = mhlo.constant dense<5.000000e-01> : tensor<1x10xf32>
%1 = mhlo.constant dense<-1.000000e+01> : tensor<1x10xf32>
%2 = mhlo.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_0 = arith.constant dense<1> : tensor<i32>
%3 = mhlo.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_1 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%4 = mhlo.constant dense<0.000000e+00> : tensor<40xf32>
%cst_2 = arith.constant dense<5> : tensor<i32>
%cst_3 = arith.constant dense<-2147483648> : tensor<i32>
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%5 = mhlo.constant dense<0.000000e+00> : tensor<5xf32>
%cst_5 = arith.constant dense<0x7F800000> : tensor<f32>
%6 = mhlo.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<f32>
%7 = "mhlo.reshape"(%arg1) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
%8 = "mhlo.pad"(%7, %cst_6) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
%9 = "mhlo.transpose"(%8) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
%10 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
%11 = "mhlo.reshape"(%10) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
%12 = mhlo.reduce(%11 init: %cst_5) across dimensions = [1, 2] : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
reducer(%arg2: tensor<f32>, %arg3: tensor<f32>) {
%87 = mhlo.minimum %arg2, %arg3 : tensor<f32>
"mhlo.return"(%87) : (tensor<f32>) -> ()
}
%13 = "mhlo.compare"(%12, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%14 = mhlo.convert(%13) : (tensor<5xi1>) -> tensor<5xi32>
%15 = mhlo.multiply %14, %cst_4 : tensor<5xi32>
%16 = mhlo.reduce(%15 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%87 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%87) : (tensor<i32>) -> ()
}
%17 = mhlo.subtract %cst_2, %16 : tensor<i32>
%18 = "mhlo.compare"(%17, %cst_2) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%19 = linalg.init_tensor [5] : tensor<5xf32>
%20 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%19 : tensor<5xf32>) : tensor<5xf32>
%21 = "mhlo.compare"(%20, %5) {comparison_direction = #mhlo<"comparison_direction EQ">} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
%22 = mhlo.convert(%21) : (tensor<5xi1>) -> tensor<5xi32>
%23 = mhlo.multiply %22, %cst_4 : tensor<5xi32>
%24 = mhlo.reduce(%23 init: %cst_3) across dimensions = [0] : (tensor<5xi32>, tensor<i32>) -> tensor<i32>
reducer(%arg2: tensor<i32>, %arg3: tensor<i32>) {
%87 = mhlo.maximum %arg2, %arg3 : tensor<i32>
"mhlo.return"(%87) : (tensor<i32>) -> ()
}
%25 = mhlo.subtract %cst_2, %24 : tensor<i32>
%26 = "mhlo.select"(%18, %cst, %25) : (tensor<i1>, tensor<i32>, tensor<i32>) -> tensor<i32>
%27 = mhlo.subtract %cst_2, %17 : tensor<i32>
cf.br ^bb1(%26, %27, %4, %cst_1, %6, %6, %9, %11, %3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%28: tensor<i32>, %29: tensor<i32>, %30: tensor<40xf32>, %31: tensor<74x40xf32>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x64xf32>, %35: tensor<5x1x1xf32>, %36: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%37 = "mhlo.compare"(%28, %29) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%38 = tensor.extract %37[] : tensor<i1>
cf.cond_br %38, ^bb2(%28, %29, %30, %31, %32, %33, %34, %35, %36 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%36 : tensor<5x1x10xf32>)
^bb2(%39: tensor<i32>, %40: tensor<i32>, %41: tensor<40xf32>, %42: tensor<74x40xf32>, %43: tensor<1x10xf32>, %44: tensor<1x10xf32>, %45: tensor<5x1x64xf32>, %46: tensor<5x1x1xf32>, %47: tensor<5x1x10xf32>): // pred: ^bb1
%48 = mhlo.add %39, %cst_0 : tensor<i32>
%49 = "mhlo.torch_index_select"(%46, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x1xf32>
%50 = "mhlo.reshape"(%49) : (tensor<1x1xf32>) -> tensor<1xf32>
%51 = "mhlo.broadcast_in_dim"(%50) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<1xf32>) -> tensor<1x10xf32>
%52 = "mhlo.compare"(%51, %6) {comparison_direction = #mhlo<"comparison_direction GT">} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xi1>
%53 = "mhlo.torch_index_select"(%45, %39) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
%54 = "mhlo.concatenate"(%53, %44) {dimension = 1 : i64} : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
%55 = "mhlo.dot"(%54, %42) {precision_config = [#mhlo<"precision DEFAULT">, #mhlo<"precision DEFAULT">]} : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
%56 = "mhlo.reshape"(%41) : (tensor<40xf32>) -> tensor<1x40xf32>
%57 = mhlo.add %55, %56 : tensor<1x40xf32>
%58 = "mhlo.slice"(%57) {limit_indices = dense<[1, 30]> : tensor<2xi64>, start_indices = dense<[0, 20]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%59 = mhlo.multiply %58, %0 : tensor<1x10xf32>
%60 = mhlo.tanh %59 : tensor<1x10xf32>
%61 = mhlo.multiply %60, %0 : tensor<1x10xf32>
%62 = mhlo.add %61, %0 : tensor<1x10xf32>
%63 = mhlo.multiply %62, %43 : tensor<1x10xf32>
%64 = "mhlo.slice"(%57) {limit_indices = dense<[1, 20]> : tensor<2xi64>, start_indices = dense<[0, 10]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%65 = mhlo.multiply %64, %0 : tensor<1x10xf32>
%66 = mhlo.tanh %65 : tensor<1x10xf32>
%67 = mhlo.multiply %66, %0 : tensor<1x10xf32>
%68 = mhlo.add %67, %0 : tensor<1x10xf32>
%69 = "mhlo.slice"(%57) {limit_indices = dense<[1, 10]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%70 = mhlo.tanh %69 : tensor<1x10xf32>
%71 = mhlo.multiply %68, %70 : tensor<1x10xf32>
%72 = mhlo.add %63, %71 : tensor<1x10xf32>
%73 = mhlo.minimum %72, %2 : tensor<1x10xf32>
%74 = mhlo.maximum %73, %1 : tensor<1x10xf32>
%75 = "mhlo.select"(%52, %43, %74) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%76 = "mhlo.slice"(%57) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
%77 = mhlo.multiply %76, %0 : tensor<1x10xf32>
%78 = mhlo.tanh %77 : tensor<1x10xf32>
%79 = mhlo.multiply %78, %0 : tensor<1x10xf32>
%80 = mhlo.add %79, %0 : tensor<1x10xf32>
%81 = mhlo.tanh %74 : tensor<1x10xf32>
%82 = mhlo.multiply %80, %81 : tensor<1x10xf32>
%83 = "mhlo.select"(%52, %44, %82) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
%84 = "mhlo.reshape"(%83) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
%85 = "mhlo.dynamic-update-slice"(%47, %84, %39, %cst, %cst) : (tensor<5x1x10xf32>, tensor<1x1x10xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x10xf32>
cf.br ^bb1(%48, %40, %41, %42, %75, %83, %45, %46, %85 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%86: tensor<5x1x10xf32>): // pred: ^bb1
return %86 : tensor<5x1x10xf32>
}
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
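// [annotation] The remaining mhlo ops are lowered onto the linalg/tensor dialects: reshape becomes tensor.collapse_shape/expand_shape, pad becomes tensor.pad, transpose and the elementwise math become linalg.generic, dot becomes linalg.matmul, concatenate becomes tensor.insert_slice into a fresh 1x74 init tensor, torch_index_select becomes a linalg.generic that does tensor.extract by the loop index, and dynamic-update-slice becomes a clamped tensor.insert_slice.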
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%cst_0 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_1 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_2 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<1> : tensor<i32>
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_7 = arith.constant dense<5> : tensor<i32>
%cst_8 = arith.constant dense<-2147483648> : tensor<i32>
%cst_9 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_10 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_11 = arith.constant dense<0x7F800000> : tensor<f32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<f32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%cst_14 = arith.constant 0.000000e+00 : f32
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_14 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%cst_15 = arith.constant 0x7F800000 : f32
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_15 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32 = arith.constant -2147483648 : i32
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_7, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_7 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32_16 = arith.constant -2147483648 : i32
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32_16 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_7, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%148 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_7, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_6, %cst_5, %cst_12, %cst_12, %3, %6, %cst_4 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
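  // [annotation] The blocks below implement the while loop: ^bb1 re-checks the i32 counter against its bound, ^bb2 is one time step of the recurrence, and ^bb3 returns the accumulated 5x1x10 result.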
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_3 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.addi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52, %63 : tensor<i32>, tensor<1x1xf32>) outs(%64 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %59[%148, %149, %150] : tensor<5x1x1xf32>
linalg.yield %151 : f32
} -> tensor<1x1xf32>
%66 = tensor.collapse_shape %65 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%67 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1xf32>) outs(%67 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%69 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%69 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<1x10xi1>
%71 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%72 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52, %71 : tensor<i32>, tensor<1x64xf32>) outs(%72 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %58[%148, %149, %150] : tensor<5x1x64xf32>
linalg.yield %151 : f32
} -> tensor<1x64xf32>
%c0 = arith.constant 0 : index
%c0_17 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c1_18 = arith.constant 1 : index
%c0_19 = arith.constant 0 : index
%c1_20 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c1_21 = arith.constant 1 : index
%c0_22 = arith.constant 0 : index
%c1_23 = arith.constant 1 : index
%c64_24 = arith.constant 64 : index
%c64_25 = arith.constant 64 : index
%c1_26 = arith.constant 1 : index
%c10 = arith.constant 10 : index
%c74 = arith.constant 74 : index
%74 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%c0_27 = arith.constant 0 : index
%c1_28 = arith.constant 1 : index
%c64_29 = arith.constant 64 : index
%75 = tensor.insert_slice %73 into %74[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%76 = arith.addi %c0_27, %c64_29 : index
%c1_30 = arith.constant 1 : index
%c10_31 = arith.constant 10 : index
%77 = tensor.insert_slice %57 into %75[0, %76] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%78 = arith.addi %76, %c10_31 : index
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%cst_32 = arith.constant 0.000000e+00 : f32
%80 = linalg.fill ins(%cst_32 : f32) outs(%79 : tensor<1x40xf32>) -> tensor<1x40xf32>
%81 = linalg.matmul ins(%77, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%80 : tensor<1x40xf32>) -> tensor<1x40xf32>
%82 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%83 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %82 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%83 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x40xf32>
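  // [annotation] The four 10-wide slices of %84 below are the gate pre-activations of the LSTM step: each sigmoid is expanded as 0.5 * tanh(0.5 * x) + 0.5 (the %cst_0 splat), the candidate path uses a plain tanh, and the new cell state is clamped to [-10, 10] by the minf/maxf pair before feeding the output-path tanh.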
%85 = tensor.extract_slice %84[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%92 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%92 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%94 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%93, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%94 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%96 = tensor.extract_slice %84[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98 : tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%101 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%102 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%101 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%103 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%104 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%102, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%103 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%105 = tensor.extract_slice %84[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%105 : tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%104, %107 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%95, %109 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%114 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%115 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%113, %cst_1 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%114 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.maxf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%116 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%117 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %56, %115 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%116 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%118 = tensor.extract_slice %84[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120 : tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%124, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%115 : tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%129 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%130 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%126, %128 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%129 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%131 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%132 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %57, %130 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%131 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%133 = tensor.expand_shape %132 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%c0_i32 = arith.constant 0 : i32
%134 = tensor.extract %52[] : tensor<i32>
%c4_i32 = arith.constant 4 : i32
%135 = arith.maxsi %c0_i32, %134 : i32
%136 = arith.minsi %135, %c4_i32 : i32
%137 = arith.index_cast %136 : i32 to index
%138 = tensor.extract %cst[] : tensor<i32>
%c0_i32_33 = arith.constant 0 : i32
%139 = arith.maxsi %c0_i32, %138 : i32
%140 = arith.minsi %139, %c0_i32_33 : i32
%141 = arith.index_cast %140 : i32 to index
%142 = tensor.extract %cst[] : tensor<i32>
%c0_i32_34 = arith.constant 0 : i32
%143 = arith.maxsi %c0_i32, %142 : i32
%144 = arith.minsi %143, %c0_i32_34 : i32
%145 = arith.index_cast %144 : i32 to index
%146 = tensor.insert_slice %133 into %60[%137, %141, %145] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %117, %132, %58, %59, %146 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%147: tensor<5x1x10xf32>): // pred: ^bb1
return %147 : tensor<5x1x10xf32>
}
// -----// IR Dump After ReconcileUnrealizedCasts //----- //
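// Note (reader annotation, hedged): this dump prints the whole module, so the affine maps are
// hoisted into #mapN aliases. The while-style loop from the original program appears here as
// explicit blocks: ^bb1 evaluates the loop condition, ^bb2 is the loop body (what looks like one
// LSTM time step), and ^bb3 returns the accumulated 5x1x10 output.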
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0> : tensor<i32>
%cst_0 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_1 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_2 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<1> : tensor<i32>
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_7 = arith.constant dense<5> : tensor<i32>
%cst_8 = arith.constant dense<-2147483648> : tensor<i32>
%cst_9 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_10 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_11 = arith.constant dense<0x7F800000> : tensor<f32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<f32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%cst_14 = arith.constant 0.000000e+00 : f32
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_14 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%cst_15 = arith.constant 0x7F800000 : f32
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_15 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32 = arith.constant -2147483648 : i32
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_7, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_7 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_10 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%148 = arith.extui %arg2 : i1 to i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_9 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.muli %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<5xi32>
%c-2147483648_i32_16 = arith.constant -2147483648 : i32
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32_16 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%148 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_7, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%148 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_7, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.subi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_6, %cst_5, %cst_12, %cst_12, %3, %6, %cst_4 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%148 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %148 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_3 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%148 = arith.addi %arg2, %arg3 : i32
linalg.yield %148 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%65 = linalg.generic {indexing_maps = [#map8, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%52, %63 : tensor<i32>, tensor<1x1xf32>) outs(%64 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %59[%148, %149, %150] : tensor<5x1x1xf32>
linalg.yield %151 : f32
} -> tensor<1x1xf32>
%66 = tensor.collapse_shape %65 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%67 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1xf32>) outs(%67 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%69 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%69 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%148 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %148 : i1
} -> tensor<1x10xi1>
%71 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%72 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%73 = linalg.generic {indexing_maps = [#map8, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%52, %71 : tensor<i32>, tensor<1x64xf32>) outs(%72 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32, %arg4: f32):
%148 = arith.index_cast %arg2 : i32 to index
%149 = linalg.index 0 : index
%150 = linalg.index 1 : index
%151 = tensor.extract %58[%148, %149, %150] : tensor<5x1x64xf32>
linalg.yield %151 : f32
} -> tensor<1x64xf32>
%c0 = arith.constant 0 : index
%c0_17 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c1_18 = arith.constant 1 : index
%c0_19 = arith.constant 0 : index
%c1_20 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c1_21 = arith.constant 1 : index
%c0_22 = arith.constant 0 : index
%c1_23 = arith.constant 1 : index
%c64_24 = arith.constant 64 : index
%c64_25 = arith.constant 64 : index
%c1_26 = arith.constant 1 : index
%c10 = arith.constant 10 : index
%c74 = arith.constant 74 : index
%74 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%c0_27 = arith.constant 0 : index
%c1_28 = arith.constant 1 : index
%c64_29 = arith.constant 64 : index
%75 = tensor.insert_slice %73 into %74[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%76 = arith.addi %c0_27, %c64_29 : index
%c1_30 = arith.constant 1 : index
%c10_31 = arith.constant 10 : index
%77 = tensor.insert_slice %57 into %75[0, %76] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%78 = arith.addi %76, %c10_31 : index
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%cst_32 = arith.constant 0.000000e+00 : f32
%80 = linalg.fill ins(%cst_32 : f32) outs(%79 : tensor<1x40xf32>) -> tensor<1x40xf32>
%81 = linalg.matmul ins(%77, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%80 : tensor<1x40xf32>) -> tensor<1x40xf32>
%82 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%83 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %82 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%83 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x40xf32>
%85 = tensor.extract_slice %84[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%92 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%92 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%94 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%95 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%93, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%94 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%96 = tensor.extract_slice %84[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98 : tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%101 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%102 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%101 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%103 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%104 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%102, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%103 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%105 = tensor.extract_slice %84[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%105 : tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%104, %107 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%95, %109 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.minf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%114 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%115 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%113, %cst_1 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%114 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.maxf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%116 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%117 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %56, %115 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%116 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%118 = tensor.extract_slice %84[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120 : tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%124, %cst_0 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.addf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%115 : tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%148 = math.tanh %arg2 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%129 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%130 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%126, %128 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%129 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%148 = arith.mulf %arg2, %arg3 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%131 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%132 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %57, %130 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%131 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%148 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %148 : f32
} -> tensor<1x10xf32>
%133 = tensor.expand_shape %132 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%c0_i32 = arith.constant 0 : i32
%134 = tensor.extract %52[] : tensor<i32>
%c4_i32 = arith.constant 4 : i32
%135 = arith.maxsi %c0_i32, %134 : i32
%136 = arith.minsi %135, %c4_i32 : i32
%137 = arith.index_cast %136 : i32 to index
%138 = tensor.extract %cst[] : tensor<i32>
%c0_i32_33 = arith.constant 0 : i32
%139 = arith.maxsi %c0_i32, %138 : i32
%140 = arith.minsi %139, %c0_i32_33 : i32
%141 = arith.index_cast %140 : i32 to index
%142 = tensor.extract %cst[] : tensor<i32>
%c0_i32_34 = arith.constant 0 : i32
%143 = arith.maxsi %c0_i32, %142 : i32
%144 = arith.minsi %143, %c0_i32_34 : i32
%145 = arith.index_cast %144 : i32 to index
%146 = tensor.insert_slice %133 into %60[%137, %141, %145] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %117, %132, %58, %59, %146 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%147: tensor<5x1x10xf32>): // pred: ^bb1
return %147 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
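// Note (reader annotation, hedged): compared with the previous dump, canonicalization appears to
// have folded the redundant index/i32 constants and the clamping of the always-zero slice offsets,
// so the tensor.insert_slice at the end of the loop body now uses literal offsets [%133, 0, 0]
// instead of three separately computed indices.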
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
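// The ops below appear to build the fused LSTM gate pre-activations: the gathered time-step
// input %71 (1x64) and the carried hidden state %57 (1x10) are packed into a single 1x74 row,
// multiplied by the 74x40 weight matrix %55, and offset by the 40-element bias %54 (expanded to
// 1x40). The 1x40 result %80 is then sliced into four 1x10 gates at column offsets 0, 10, 20, 30.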
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
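// Gate nonlinearities: each x*0.5 -> tanh -> *0.5 -> +0.5 chain below is the identity
// sigmoid(x) = 0.5*tanh(x/2) + 0.5, spelled out with the splat constant %cst_2. Reading the
// slices that way, offset 20 looks like the forget gate (multiplied into the carried cell state
// %56), offset 10 the input gate, offset 0 (plain tanh) the candidate cell update, and offset 30
// the output gate.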
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
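// %111 is the updated cell state f*c_prev + i*g, clamped to [-10, 10] via the splat constants
// %cst_4 / %cst_3. The select that follows keeps the carried cell %56 wherever the per-time-step
// predicate %69 holds (apparently a sequence/padding mask derived from %arg0) and takes the
// freshly computed cell otherwise.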
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
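// %126 = output gate * tanh(new cell). The select below again falls back to the carried hidden
// state %57 on masked steps; the result %128 is reshaped to 1x1x10 and written into row %133 of
// the 5x1x10 output, where %133 is the loop counter %52 clamped to [0, 4].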
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
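// Note: the function above already carries the lowered loop as plain CFG control
// flow: ^bb1 compares the induction variable against the trip count, ^bb2 appears
// to run one LSTM cell step, and ^bb3 returns the accumulated 5x1x10 result. In
// ^bb2 the 1x74 concatenation of the gathered 1x64 input slice and the previous
// 1x10 hidden state is multiplied by the 74x40 weight matrix; of the four 1x10
// slices of the 1x40 result (offsets 0, 10, 20, 30), the three gate slices go
// through the sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5 identity while the candidate
// slice uses a plain tanh, the updated cell state is clamped to [-10, 10], and
// the new hidden state is inserted into the 5x1x10 output at the clamped step
// index. The IREEImportPublic dump that follows shows the same IR.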
// -----// IR Dump After IREEImportPublic //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
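// Note: SanitizeModuleNames appears to leave this module untouched; the dump
// below matches the one above, since the @main symbol is already a legal name.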
// -----// IR Dump After SanitizeModuleNames //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
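// Note: in the WrapEntryPointsPass dump below, the original entry point is moved
// into a private @_main, and a new public @main is emitted that imports the
// !hal.buffer_view arguments into tensors via hal.tensor.import, calls @_main,
// and exports the 5x1x10 result back to a !hal.buffer_view.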
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
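// Note on the prologue above: before entering the cf.br loop, @_main derives the loop
// bounds from the transposed 1x5 input (%arg0). It marks the entries equal to zero,
// multiplies that mask by the constant [1, 2, 3, 4, 5], and max-reduces once over the
// values and once over their reversal; the two reductions give the last and first
// zero-valued positions, and the loop then iterates over exactly that range (zero
// iterations when no entry is zero). A minimal NumPy restatement of that arithmetic
// follows; the name "flags" and the helper are interpretive, not taken from the IR.
import numpy as np

def loop_bounds(flags):
    # flags corresponds to the transposed %arg0 row; a 0 entry marks a step the loop visits.
    flags = np.asarray(flags).reshape(5)
    positions = np.arange(1, 6)                      # the dense<[1, 2, 3, 4, 5]> constant
    last = int(np.max((flags == 0) * positions))     # 1-based position of the last zero, 0 if none
    rlast = int(np.max((flags[::-1] == 0) * positions))
    start = 0 if last == 0 else 5 - rlast            # 0-based index of the first zero entry
    end = last                                       # one past the 0-based index of the last zero
    return start, end                                # the loop in ^bb1/^bb2 runs for t in [start, end)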
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
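// Note on the loop body above: each trip through ^bb2 is one LSTM cell step. The 1x64
// input slice for step t and the previous 1x10 hidden state are packed into a 1x74 row
// (the two tensor.insert_slice ops), multiplied by the 74x40 weight constant (all 0.42)
// and offset by the 40-element zero bias; the four 1x10 slices of the result are the
// gate pre-activations. Sigmoid shows up as the 0.5*tanh(0.5*x)+0.5 pattern, the new
// cell state is clamped to [-10, 10], and a final select keyed off the sign of the
// corresponding %arg0 entry decides whether the step keeps the previous h/c or the
// freshly computed ones before h is written into the 5x1x10 result. A minimal NumPy
// sketch of the gate math (names such as x_t, h_prev, c_prev, W, b are interpretive):
import numpy as np

def lstm_step(x_t, h_prev, c_prev, W, b, cell_clip=10.0):
    xh = np.concatenate([x_t, h_prev], axis=-1)        # (1, 74), like the insert_slice pair
    gates = xh @ W + b                                  # (1, 40), the matmul + bias add
    j, i, f, o = np.split(gates, 4, axis=-1)            # slices [0:10], [10:20], [20:30], [30:40]
    sigmoid = lambda z: 0.5 * np.tanh(0.5 * z) + 0.5    # the tanh-based sigmoid in the IR
    c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(j)   # forget * old cell + input * candidate
    c = np.clip(c, -cell_clip, cell_clip)               # the minf/maxf clamp against +/-10
    h = sigmoid(o) * np.tanh(c)                         # output gate times squashed cell
    return h, c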
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%23 = linalg.init_tensor [5] : tensor<5xf32>
%24 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%23 : tensor<5xf32>) : tensor<5xf32>
%25 = linalg.init_tensor [5] : tensor<5xi1>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%25 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<5xi1>
%27 = linalg.init_tensor [5] : tensor<5xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26 : tensor<5xi1>) outs(%27 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%136 = arith.extui %arg2 : i1 to i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%29 = linalg.init_tensor [5] : tensor<5xi32>
%30 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%29 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.muli %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<5xi32>
%31 = linalg.init_tensor [] : tensor<i32>
%32 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%31 : tensor<i32>) -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%30 : tensor<5xi32>) outs(%32 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%136 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%34 = linalg.init_tensor [] : tensor<i32>
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %33 : tensor<i32>, tensor<i32>) outs(%34 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_1, %35 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%136 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%38 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.subi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
cf.br ^bb1(%37, %39, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%40: tensor<i32>, %41: tensor<i32>, %42: tensor<40xf32>, %43: tensor<74x40xf32>, %44: tensor<1x10xf32>, %45: tensor<1x10xf32>, %46: tensor<5x1x64xf32>, %47: tensor<5x1x1xf32>, %48: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%49 = linalg.init_tensor [] : tensor<i1>
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%40, %41 : tensor<i32>, tensor<i32>) outs(%49 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%136 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %136 : i1
} -> tensor<i1>
%51 = tensor.extract %50[] : tensor<i1>
cf.cond_br %51, ^bb2(%40, %41, %42, %43, %44, %45, %46, %47, %48 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%48 : tensor<5x1x10xf32>)
^bb2(%52: tensor<i32>, %53: tensor<i32>, %54: tensor<40xf32>, %55: tensor<74x40xf32>, %56: tensor<1x10xf32>, %57: tensor<1x10xf32>, %58: tensor<5x1x64xf32>, %59: tensor<5x1x1xf32>, %60: tensor<5x1x10xf32>): // pred: ^bb1
%61 = linalg.init_tensor [] : tensor<i32>
%62 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%52, %cst_5 : tensor<i32>, tensor<i32>) outs(%61 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%136 = arith.addi %arg2, %arg3 : i32
linalg.yield %136 : i32
} -> tensor<i32>
%63 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%63 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %59[%136, %137, %138] : tensor<5x1x1xf32>
linalg.yield %139 : f32
} -> tensor<1x1xf32>
%65 = tensor.collapse_shape %64 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%66 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65 : tensor<1xf32>) outs(%66 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%68 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%68 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%136 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %136 : i1
} -> tensor<1x10xi1>
%70 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%52 : tensor<i32>) outs(%70 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%136 = arith.index_cast %arg2 : i32 to index
%137 = linalg.index 0 : index
%138 = linalg.index 1 : index
%139 = tensor.extract %58[%136, %137, %138] : tensor<5x1x64xf32>
linalg.yield %139 : f32
} -> tensor<1x64xf32>
%72 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%73 = tensor.insert_slice %71 into %72[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%74 = tensor.insert_slice %57 into %73[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%75 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%76 = linalg.fill ins(%cst_0 : f32) outs(%75 : tensor<1x40xf32>) -> tensor<1x40xf32>
%77 = linalg.matmul ins(%74, %55 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%76 : tensor<1x40xf32>) -> tensor<1x40xf32>
%78 = tensor.expand_shape %54 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%79 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%79 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x40xf32>
%81 = tensor.extract_slice %80[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%82 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%81, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%82 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%84 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83 : tensor<1x10xf32>) outs(%84 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%86 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%85, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%86 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%88 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%88 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%90 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %56 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%90 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%92 = tensor.extract_slice %80[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%93 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%94 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%92, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%93 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%95 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%96 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%94 : tensor<1x10xf32>) outs(%95 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%97 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%98 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%96, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%97 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%99 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%98, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%99 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%101 = tensor.extract_slice %80[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%102 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%101 : tensor<1x10xf32>) outs(%102 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%104 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%100, %103 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%104 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%106 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%91, %105 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%106 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%108 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%107, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%108 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.minf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%110 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%111 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%109, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%110 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.maxf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%112 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %56, %111 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%112 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%114 = tensor.extract_slice %80[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%115 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%116 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%114, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%115 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%117 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%118 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%116 : tensor<1x10xf32>) outs(%117 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%119 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%118, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%119 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%121 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%122 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%120, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%121 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.addf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%123 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%111 : tensor<1x10xf32>) outs(%123 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%136 = math.tanh %arg2 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%125 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%122, %124 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%125 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%136 = arith.mulf %arg2, %arg3 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%127 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %57, %126 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%127 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%136 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %136 : f32
} -> tensor<1x10xf32>
%129 = tensor.expand_shape %128 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%130 = tensor.extract %52[] : tensor<i32>
%131 = arith.maxsi %130, %c0_i32 : i32
%132 = arith.minsi %131, %c4_i32 : i32
%133 = arith.index_cast %132 : i32 to index
%134 = tensor.insert_slice %129 into %60[%133, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%62, %53, %54, %55, %113, %128, %58, %59, %134 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%135: tensor<5x1x10xf32>): // pred: ^bb1
return %135 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
// -----// IR Dump After SymbolDCE //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
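// A rough reading of the loop body in ^bb2 above (an interpretation of the IR, not part of the
// compiler output): it is the lowered LSTM cell. With c = %45 (previous cell state), h = %46
// (previous hidden state), and z = [x_t, h] * W + b (the 1x74 * 74x40 matmul plus bias, %67),
// the 0.5 * tanh(0.5 * x) + 0.5 pattern is sigmoid(x), so the arithmetic works out to roughly:
//   g     = tanh(z[:, 0:10])
//   i     = sigmoid(z[:, 10:20])
//   f     = sigmoid(z[:, 20:30])
//   o     = sigmoid(z[:, 30:40])
//   c_new = clamp(f * c + i * g, -10, 10)
//   h_new = o * tanh(c_new)
// The select ops against the %57 mask (arg0[t] > 0) keep the previous c/h where the mask holds
// and take c_new/h_new otherwise, and h is then written into row t of the 5x1x10 result via the
// clamped dynamic tensor.insert_slice at the end of the block.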
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- //
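// The pass named above demotes f64 types and constants to f32 so the whole module runs in
// single precision. A minimal sketch of the rewrite (hypothetical op, not taken from this
// module):
//   %x = arith.constant 1.000000e+00 : f64   // before
//   %x = arith.constant 1.000000e+00 : f32   // after
// This program is already entirely f32, so the pass has nothing to demote here and the IR
// below matches the previous dump.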
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
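// ConvertConv2D1x1ConvToMatmul rewrites convolutions with a 1x1 filter window into matmuls on
// collapsed shapes. A minimal sketch of the idea (hypothetical shapes, not taken from this
// module):
//   %0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
//          ins(%in, %filter : tensor<1x8x8x16xf32>, tensor<1x1x16x32xf32>)
//          outs(%acc : tensor<1x8x8x32xf32>) -> tensor<1x8x8x32xf32>
// becomes, after collapsing the unit filter window and the spatial dims,
//   %1 = linalg.matmul ins(%in2d, %filter2d : tensor<64x16xf32>, tensor<16x32xf32>)
//          outs(%acc2d : tensor<64x32xf32>) -> tensor<64x32xf32>
// There are no convolutions in this program (the LSTM is already expressed with linalg.matmul),
// so the pass leaves @main and @_main untouched.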
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegality //----- //
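// VerifyInputLegality is a check rather than a rewrite: it verifies that no unconverted
// input-dialect ops remain after the input conversion pipeline and fails compilation if any
// are found. Presumably a leftover op such as
//   %0 = "mhlo.add"(%a, %b) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
// would be reported here; since everything has already been lowered to linalg/tensor ops,
// the dumps around this pass are identical.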
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
// -----// IR Dump After VerifyInputLegality //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
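// NOTE (annotation, not part of the pass dump): the loop body in ^bb2 above appears
// to be one LSTM time step. Under that reading, %62 concatenates the 1x64 padded
// input slice for the current step with the 1x10 hidden state, the matmul against
// the fused 74x40 weights plus the 40-wide bias yields the gate pre-activations %67,
// and the four 1x10 gates are sliced out at column offsets 0/10/20/30:
//   g = tanh(z[:, 0:10])        (candidate)
//   i = sigmoid(z[:, 10:20])    (input gate)
//   f = sigmoid(z[:, 20:30])    (forget gate)
//   o = sigmoid(z[:, 30:40])    (output gate)
//   c_new = clamp(f * c + i * g, -10, 10)
//   h_new = o * tanh(c_new)
// sigmoid(x) is materialized as 0.5 * tanh(0.5 * x) + 0.5, the clamp is the
// minf/maxf pair against +/-1.0e+01, and the selects on the %57 mask keep the
// previous c/h for masked steps. The new h is then insert_slice'd into the 5x1x10
// accumulator at the loop index clamped to [0, 4] before branching back to ^bb1.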
// -----// IR Dump After LinalgNamedOpConversion //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
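// NOTE (annotation, not part of the pass dump): LinalgNamedOpConversion leaves this
// IR structurally unchanged from the function printed above it; the visible
// difference is only that the dump is now at module scope, so the inline affine_map
// attributes are hoisted into the #map0..#map9 aliases and the public @main ABI
// wrapper (hal.tensor.import / call @_main / hal.tensor.export) is printed alongside
// the private @_main body.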
// -----// IR Dump After ExpandTensorShapes //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
}
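// NOTE (annotation, not part of the pass dump): ExpandTensorShapes, as I understand
// it, expands dynamically shaped tensors into (tensor, dynamic dims) pairs across
// globals and call boundaries; every tensor in this module is statically shaped, so
// the dump above is identical to the LinalgNamedOpConversion dump that precedes it.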
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
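// NOTE (annotation, not part of the pass dump): @main is the public ABI wrapper
// only: it imports the two !hal.buffer_view arguments as 1x5xf32 and 1x5x2x2xf32
// tensors, forwards them to the private @_main (dumped next), and exports the
// 5x1x10xf32 result back to a !hal.buffer_view. SimplifyGlobalAccessesPass is dumped
// per function, which is why @main and @_main appear as separate dumps here.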
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst = arith.constant 0x7F800000 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant dense<0> : tensor<i32>
%cst_2 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_3 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1> : tensor<i32>
%cst_6 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_7 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_0 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%101 = arith.extui %arg2 : i1 to i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.muli %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%101 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_1, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%101 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.subi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %29, %cst_8, %cst_7, %cst_12, %cst_12, %3, %6, %cst_6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<i32>, %32: tensor<40xf32>, %33: tensor<74x40xf32>, %34: tensor<1x10xf32>, %35: tensor<1x10xf32>, %36: tensor<5x1x64xf32>, %37: tensor<5x1x1xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %31 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%101 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %101 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2(%30, %31, %32, %33, %34, %35, %36, %37, %38 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%38 : tensor<5x1x10xf32>)
^bb2(%41: tensor<i32>, %42: tensor<i32>, %43: tensor<40xf32>, %44: tensor<74x40xf32>, %45: tensor<1x10xf32>, %46: tensor<1x10xf32>, %47: tensor<5x1x64xf32>, %48: tensor<5x1x1xf32>, %49: tensor<5x1x10xf32>): // pred: ^bb1
%50 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%41, %cst_5 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%101 = arith.addi %arg2, %arg3 : i32
linalg.yield %101 : i32
} -> tensor<i32>
%51 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%51 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %48[%101, %102, %103] : tensor<5x1x1xf32>
linalg.yield %104 : f32
} -> tensor<1x1xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%56 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%56 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%101 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %101 : i1
} -> tensor<1x10xi1>
%58 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<i32>) outs(%58 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%101 = arith.index_cast %arg2 : i32 to index
%102 = linalg.index 0 : index
%103 = linalg.index 1 : index
%104 = tensor.extract %47[%101, %102, %103] : tensor<5x1x64xf32>
linalg.yield %104 : f32
} -> tensor<1x64xf32>
%60 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%61 = tensor.insert_slice %59 into %60[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%62 = tensor.insert_slice %46 into %61[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%63 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%64 = linalg.fill ins(%cst_0 : f32) outs(%63 : tensor<1x40xf32>) -> tensor<1x40xf32>
%65 = linalg.matmul ins(%62, %44 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%64 : tensor<1x40xf32>) -> tensor<1x40xf32>
%66 = tensor.expand_shape %43 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %66 : tensor<1x40xf32>, tensor<1x40xf32>) outs(%63 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x40xf32>
%68 = tensor.extract_slice %67[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72, %45 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%74 = tensor.extract_slice %67[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%79 = tensor.extract_slice %67[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%79 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%78, %80 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %81 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%82, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.minf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%83, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.maxf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %45, %84 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%86 = tensor.extract_slice %67[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%87 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%86, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%87 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%89 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%88, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%89, %cst_2 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.addf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%84 : tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%101 = math.tanh %arg2 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%90, %91 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%101 = arith.mulf %arg2, %arg3 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%93 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %46, %92 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%101 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %101 : f32
} -> tensor<1x10xf32>
%94 = tensor.expand_shape %93 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%95 = tensor.extract %41[] : tensor<i32>
%96 = arith.maxsi %95, %c0_i32 : i32
%97 = arith.minsi %96, %c4_i32 : i32
%98 = arith.index_cast %97 : i32 to index
%99 = tensor.insert_slice %94 into %49[%98, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%50, %42, %43, %44, %85, %93, %47, %48, %99 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
^bb3(%100: tensor<5x1x10xf32>): // pred: ^bb1
return %100 : tensor<5x1x10xf32>
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- //
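// Note (editorial, hedged): between the previous dump and this one the loop-invariant operands
// appear to have been hoisted out of the ^bb1 block arguments (the weights, bias and the padded
// inputs are now referenced directly as %cst_8, %3 and %6), shrinking the loop-carried values
// from nine to four; the unused block arguments of ^bb2/^bb3 were dropped, and the expand_shape
// of the zero 40xf32 bias was folded into the tensor<1x40xf32> constant %cst.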
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- //
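// Note (editorial, hedged): this dump appears identical to the ApplyPatternsPass output above;
// the module contains no util.global ops for FoldGlobals to rewrite.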
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
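// Note (editorial): the @main wrapper is already minimal, so Canonicalizer and the CSE pass
// dumped below leave it unchanged.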
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass //----- //
#map0 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1) -> (d1, d0)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0)>
#map5 = affine_map<(d0) -> (d0)>
#map6 = affine_map<(d0) -> ()>
#map7 = affine_map<() -> ()>
#map8 = affine_map<(d0, d1) -> ()>
#map9 = affine_map<(d0, d1) -> (d0)>
module {
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = tensor.pad %0 low[0, 0, 0] high[0, 0, 60] {
^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %cst_1 : f32
} : tensor<1x5x4xf32> to tensor<1x5x64xf32>
%2 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<1x5x64xf32>) outs(%2 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%4 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%5 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%4 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%6 = tensor.expand_shape %5 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%7 = linalg.init_tensor [5] : tensor<5xf32>
%8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<5xf32>) -> tensor<5xf32>
%9 = linalg.generic {indexing_maps = [#map1, #map4], iterator_types = ["parallel", "reduction", "reduction"]} ins(%6 : tensor<5x1x1xf32>) outs(%8 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<5xf32>
%10 = linalg.init_tensor [5] : tensor<5xi1>
%11 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%9, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%12 = linalg.init_tensor [5] : tensor<5xi32>
%13 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%11 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%14 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%13, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%15 = linalg.init_tensor [] : tensor<i32>
%16 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%17 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%14 : tensor<5xi32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%18 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %17 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%19 = linalg.init_tensor [] : tensor<i1>
%20 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%18, %cst_9 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%21 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%9 : tensor<5xf32>) outs(%7 : tensor<5xf32>) : tensor<5xf32>
%22 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%21, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%10 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<5xi1>
%23 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel"]} ins(%22 : tensor<5xi1>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%85 = arith.extui %arg2 : i1 to i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%24 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel"]} ins(%23, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%12 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.muli %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<5xi32>
%25 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%15 : tensor<i32>) -> tensor<i32>
%26 = linalg.generic {indexing_maps = [#map5, #map6], iterator_types = ["reduction"]} ins(%24 : tensor<5xi32>) outs(%25 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%85 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %26 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [#map7, #map7, #map7, #map7], iterator_types = []} ins(%20, %cst_2, %27 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%85 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%cst_9, %18 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.subi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
cf.br ^bb1(%28, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%30: tensor<i32>, %31: tensor<1x10xf32>, %32: tensor<1x10xf32>, %33: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%34 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %29 : tensor<i32>, tensor<i32>) outs(%19 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%85 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %85 : i1
} -> tensor<i1>
%35 = tensor.extract %34[] : tensor<i1>
cf.cond_br %35, ^bb2, ^bb3
^bb2: // pred: ^bb1
%36 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = []} ins(%30, %cst_6 : tensor<i32>, tensor<i32>) outs(%15 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%85 = arith.addi %arg2, %arg3 : i32
linalg.yield %85 : i32
} -> tensor<i32>
%37 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%38 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%37 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %6[%85, %86, %87] : tensor<5x1x1xf32>
linalg.yield %88 : f32
} -> tensor<1x1xf32>
%39 = tensor.collapse_shape %38 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%40 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%41 = linalg.generic {indexing_maps = [#map9, #map3], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%43 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%41, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%85 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %85 : i1
} -> tensor<1x10xi1>
%44 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%45 = linalg.generic {indexing_maps = [#map8, #map3], iterator_types = ["parallel", "parallel"]} ins(%30 : tensor<i32>) outs(%44 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%85 = arith.index_cast %arg2 : i32 to index
%86 = linalg.index 0 : index
%87 = linalg.index 1 : index
%88 = tensor.extract %3[%85, %86, %87] : tensor<5x1x64xf32>
linalg.yield %88 : f32
} -> tensor<1x64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%48 = tensor.insert_slice %32 into %47[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%49 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%50 = linalg.fill ins(%cst_1 : f32) outs(%49 : tensor<1x40xf32>) -> tensor<1x40xf32>
%51 = linalg.matmul ins(%48, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%51, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%49 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x40xf32>
%53 = tensor.extract_slice %52[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%54 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%53, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%54 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%56, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%57, %31 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%59 = tensor.extract_slice %52[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%59, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%61 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%60 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%62, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%64 = tensor.extract_slice %52[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%64 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%66 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%63, %65 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%58, %66 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%67, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.minf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%68, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.maxf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %31, %69 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%71 = tensor.extract_slice %52[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%71, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%73 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%72 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%74, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.addf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%69 : tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%85 = math.tanh %arg2 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%75, %76 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%85 = arith.mulf %arg2, %arg3 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%43, %32, %77 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%40 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%85 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %85 : f32
} -> tensor<1x10xf32>
%79 = tensor.expand_shape %78 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%80 = tensor.extract %30[] : tensor<i32>
%81 = arith.maxsi %80, %c0_i32 : i32
%82 = arith.minsi %81, %c4_i32 : i32
%83 = arith.index_cast %82 : i32 to index
%84 = tensor.insert_slice %79 into %33[%83, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%36, %70, %78, %84 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %33 : tensor<5x1x10xf32>
}
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After SplitReduction //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CaptureDispatchDynamicDims //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func.func @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<1x5xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<1x5x2x2xf32>
%2 = call @_main(%0, %1) : (tensor<1x5xf32>, tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32>
%3 = hal.tensor.export %2 : tensor<5x1x10xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
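// Annotation (editor's reading, not compiler output): the @main dumps above are just the
// ABI wrapper -- two hal.tensor.import ops, a call to @_main, and a hal.tensor.export of
// the 5x1x10 result -- so the passes listed leave it unchanged; the substantive
// transformations show up in the @_main dumps that follow.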
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%2 = linalg.fill ins(%cst_1 : f32) outs(%1 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%3 = tensor.insert_slice %0 into %2[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<1x5x4xf32> into tensor<1x5x64xf32>
%4 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<1x5x64xf32>) outs(%4 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%6 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%6 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%8 = tensor.expand_shape %7 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<5xf32>) -> tensor<5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8 : tensor<5x1x1xf32>) outs(%10 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<5xf32>
%12 = linalg.init_tensor [5] : tensor<5xi1>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%15, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%17 = linalg.init_tensor [] : tensor<i32>
%18 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%16 : tensor<5xi32>) outs(%18 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %19 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%23 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%11 : tensor<5xf32>) outs(%9 : tensor<5xf32>) : tensor<5xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%25, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%27 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%26 : tensor<5xi32>) outs(%27 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %28 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_2, %29 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%87 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
cf.br ^bb1(%30, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%32: tensor<i32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %31 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2, ^bb3
^bb2: // pred: ^bb1
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %cst_6 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.addi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%39 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%39 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %8[%87, %88, %89] : tensor<5x1x1xf32>
linalg.yield %90 : f32
} -> tensor<1x1xf32>
%41 = tensor.collapse_shape %40 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<1xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%44 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%44 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<1x10xi1>
%46 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%46 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %5[%87, %88, %89] : tensor<5x1x64xf32>
linalg.yield %90 : f32
} -> tensor<1x64xf32>
%48 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%49 = tensor.insert_slice %47 into %48[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%50 = tensor.insert_slice %34 into %49[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%51 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%52 = linalg.fill ins(%cst_1 : f32) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = linalg.matmul ins(%50, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%52 : tensor<1x40xf32>) -> tensor<1x40xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%51 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x40xf32>
%55 = tensor.extract_slice %54[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %33 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%61 = tensor.extract_slice %54[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%66 = tensor.extract_slice %54[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %67 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60, %68 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.maxf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %33, %71 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%73 = tensor.extract_slice %54[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %34, %79 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%81 = tensor.expand_shape %80 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%82 = tensor.extract %32[] : tensor<i32>
%83 = arith.maxsi %82, %c0_i32 : i32
%84 = arith.minsi %83, %c4_i32 : i32
%85 = arith.index_cast %84 : i32 to index
%86 = tensor.insert_slice %81 into %35[%85, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%38, %72, %80, %86 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %35 : tensor<5x1x10xf32>
}
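// Annotation (editor's reading of the IR above, not compiler output): each trip through
// ^bb2 of @_main computes one LSTM cell step. The 1x74 row is [x_t (64) | h_{t-1} (10)];
// the matmul against the dense<4.200000e-01> 74x40 weights plus the zero bias yields four
// 10-wide gate pre-activations at column offsets 0, 10, 20, and 30 of the 1x40 result.
// The offset-0 slice passes through a plain tanh (the candidate); the offset-10, -20, and
// -30 slices go through a sigmoid written as
//   sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5.
// The offset-20 gate scales the previous cell state (the forget path), the offset-10 gate
// scales the candidate, and their sum is clamped to [-10, 10] via minf/maxf to give the
// new cell state; h_t is the offset-30 gate times tanh of that cell state. A mask formed
// by comparing the broadcast per-step scalar input against zero selects the previous
// (c, h) where it holds and the updated values otherwise, and h_t is written into the
// 5x1x10 accumulator at the timestep index clamped to [0, 4] before branching back to ^bb1.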
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<1x40xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant dense<0> : tensor<i32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<1x10xf32>
%cst_4 = arith.constant dense<-1.000000e+01> : tensor<1x10xf32>
%cst_5 = arith.constant dense<1.000000e+01> : tensor<1x10xf32>
%cst_6 = arith.constant dense<1> : tensor<i32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_8 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_9 = arith.constant dense<5> : tensor<i32>
%cst_10 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_11 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
%1 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%2 = linalg.fill ins(%cst_1 : f32) outs(%1 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%3 = tensor.insert_slice %0 into %2[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<1x5x4xf32> into tensor<1x5x64xf32>
%4 = linalg.init_tensor [5, 1, 64] : tensor<5x1x64xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<1x5x64xf32>) outs(%4 : tensor<5x1x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1x64xf32>
%6 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%6 : tensor<5x1xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x1xf32>
%8 = tensor.expand_shape %7 [[0], [1, 2]] : tensor<5x1xf32> into tensor<5x1x1xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<5xf32>) -> tensor<5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8 : tensor<5x1x1xf32>) outs(%10 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<5xf32>
%12 = linalg.init_tensor [5] : tensor<5xi1>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%14 = linalg.init_tensor [5] : tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%13 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%15, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%17 = linalg.init_tensor [] : tensor<i32>
%18 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%16 : tensor<5xi32>) outs(%18 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %19 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%21 = linalg.init_tensor [] : tensor<i1>
%22 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%20, %cst_9 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%23 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%11 : tensor<5xf32>) outs(%9 : tensor<5xf32>) : tensor<5xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23, %cst_11 : tensor<5xf32>, tensor<5xf32>) outs(%12 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<5xi1>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24 : tensor<5xi1>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%87 = arith.extui %arg2 : i1 to i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%25, %cst_10 : tensor<5xi32>, tensor<5xi32>) outs(%14 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.muli %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<5xi32>
%27 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%17 : tensor<i32>) -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%26 : tensor<5xi32>) outs(%27 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%87 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %28 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %cst_2, %29 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%87 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_9, %20 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.subi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
cf.br ^bb1(%30, %cst_12, %cst_12, %cst_7 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%32: tensor<i32>, %33: tensor<1x10xf32>, %34: tensor<1x10xf32>, %35: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%36 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %31 : tensor<i32>, tensor<i32>) outs(%21 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%87 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %87 : i1
} -> tensor<i1>
%37 = tensor.extract %36[] : tensor<i1>
cf.cond_br %37, ^bb2, ^bb3
^bb2: // pred: ^bb1
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%32, %cst_6 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%87 = arith.addi %arg2, %arg3 : i32
linalg.yield %87 : i32
} -> tensor<i32>
%39 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%39 : tensor<1x1xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %8[%87, %88, %89] : tensor<5x1x1xf32>
linalg.yield %90 : f32
} -> tensor<1x1xf32>
%41 = tensor.collapse_shape %40 [[0, 1]] : tensor<1x1xf32> into tensor<1xf32>
%42 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%43 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%41 : tensor<1xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<1x10xf32>
%44 = linalg.init_tensor [1, 10] : tensor<1x10xi1>
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%43, %cst_12 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%44 : tensor<1x10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%87 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %87 : i1
} -> tensor<1x10xi1>
%46 = linalg.init_tensor [1, 64] : tensor<1x64xf32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%32 : tensor<i32>) outs(%46 : tensor<1x64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%87 = arith.index_cast %arg2 : i32 to index
%88 = linalg.index 0 : index
%89 = linalg.index 1 : index
%90 = tensor.extract %5[%87, %88, %89] : tensor<5x1x64xf32>
linalg.yield %90 : f32
} -> tensor<1x64xf32>
%48 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%49 = tensor.insert_slice %47 into %48[0, 0] [1, 64] [1, 1] : tensor<1x64xf32> into tensor<1x74xf32>
%50 = tensor.insert_slice %34 into %49[0, 64] [1, 10] [1, 1] : tensor<1x10xf32> into tensor<1x74xf32>
%51 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%52 = linalg.fill ins(%cst_1 : f32) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = linalg.matmul ins(%50, %cst_8 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%52 : tensor<1x40xf32>) -> tensor<1x40xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%53, %cst : tensor<1x40xf32>, tensor<1x40xf32>) outs(%51 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x40xf32>
%55 = tensor.extract_slice %54[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%55, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%56 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%57, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%58, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%59, %33 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%61 = tensor.extract_slice %54[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%61, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%62 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%64 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%63, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%64, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%66 = tensor.extract_slice %54[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%66 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%65, %67 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%60, %68 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%69, %cst_5 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.minf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%70, %cst_4 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.maxf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %33, %71 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%73 = tensor.extract_slice %54[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<1x10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%73, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%74 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%76 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%75, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%76, %cst_3 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.addf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%87 = math.tanh %arg2 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%77, %78 : tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%87 = arith.mulf %arg2, %arg3 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %34, %79 : tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%42 : tensor<1x10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%87 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %87 : f32
} -> tensor<1x10xf32>
%81 = tensor.expand_shape %80 [[0], [1, 2]] : tensor<1x10xf32> into tensor<1x1x10xf32>
%82 = tensor.extract %32[] : tensor<i32>
%83 = arith.maxsi %82, %c0_i32 : i32
%84 = arith.minsi %83, %c4_i32 : i32
%85 = arith.index_cast %84 : i32 to index
%86 = tensor.insert_slice %81 into %35[%85, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<1x1x10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%38, %72, %80, %86 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %35 : tensor<5x1x10xf32>
}
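// Annotation (editor's reading, not compiler output): this ConvertElementwiseToLinalg dump
// of @_main matches the preceding PadTensorToSubTensorInsert dump verbatim; the
// element-wise work is already expressed as linalg.generic ops, so the pass appears to be
// a no-op here. The LinalgFoldUnitExtentDims dump below is where the unit dimensions
// (e.g. tensor<1x10xf32> -> tensor<10xf32>) start to be collapsed.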
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = linalg.init_tensor [5, 64] : tensor<5x64xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<5x64xf32>) outs(%5 : tensor<5x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x64xf32>
%7 = tensor.expand_shape %6 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%8 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%8 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5xf32>
%11 = tensor.expand_shape %10 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%12 = linalg.init_tensor [5] : tensor<5xf32>
%13 = linalg.fill ins(%cst_4 : f32) outs(%12 : tensor<5xf32>) -> tensor<5xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<5xf32>) outs(%13 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<5xf32>
%15 = linalg.init_tensor [5] : tensor<5xi1>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%17 = linalg.init_tensor [5] : tensor<5xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%16 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%18, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%20 = linalg.init_tensor [] : tensor<i32>
%21 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%19 : tensor<5xi32>) outs(%21 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%23 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %22 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%24 = linalg.init_tensor [] : tensor<i1>
%25 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%23, %cst_10 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%26 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%14 : tensor<5xf32>) outs(%12 : tensor<5xf32>) : tensor<5xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%27 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%29 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%30 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%29 : tensor<5xi32>) outs(%30 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%32 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %31 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%25, %cst_6, %32 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%119 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %23 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
cf.br ^bb1(%33, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%35: tensor<i32>, %36: tensor<1x10xf32>, %37: tensor<1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %34 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2, ^bb3
^bb2: // pred: ^bb1
%41 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %cst_7 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.addi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%42 = linalg.init_tensor [] : tensor<f32>
%43 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%42 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = tensor.extract %11[%119, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %120 : f32
} -> tensor<f32>
%44 = linalg.init_tensor [10] : tensor<10xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43 : tensor<f32>) outs(%44 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%46 = linalg.init_tensor [10] : tensor<10xi1>
%47 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45, %cst : tensor<10xf32>, tensor<10xf32>) outs(%46 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<10xi1>
%48 = linalg.init_tensor [64] : tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35 : tensor<i32>) outs(%48 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = linalg.index 0 : index
%121 = tensor.extract %7[%119, %c0, %120] : tensor<5x1x64xf32>
linalg.yield %121 : f32
} -> tensor<64xf32>
%50 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%51 = tensor.insert_slice %49 into %50[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%52 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%53 = tensor.insert_slice %52 into %51[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%54 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%55 = linalg.fill ins(%cst_5 : f32) outs(%54 : tensor<1x40xf32>) -> tensor<1x40xf32>
%56 = linalg.matmul ins(%53, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%55 : tensor<1x40xf32>) -> tensor<1x40xf32>
%57 = tensor.collapse_shape %56 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%58 = linalg.init_tensor [40] : tensor<40xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%58 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<40xf32>
%60 = tensor.expand_shape %59 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%61 = tensor.extract_slice %60[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63 : tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%66 = linalg.init_tensor [10] : tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%66 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%68 = linalg.init_tensor [10] : tensor<10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%68 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%70 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%71 = linalg.init_tensor [10] : tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %70 : tensor<10xf32>, tensor<10xf32>) outs(%71 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%73 = tensor.extract_slice %60[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75 : tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%78 = linalg.init_tensor [10] : tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%78 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%80 = linalg.init_tensor [10] : tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%80 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%82 = tensor.extract_slice %60[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%82 : tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %84 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %86 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%89 = linalg.init_tensor [10] : tensor<10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%89 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%91 = linalg.init_tensor [10] : tensor<10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%90, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%91 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.maxf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%93 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %93, %92 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%96 = tensor.expand_shape %95 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%97 = tensor.extract_slice %60[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99 : tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%103, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%106 = linalg.init_tensor [10] : tensor<10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%92 : tensor<10xf32>) outs(%106 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%108 = linalg.init_tensor [10] : tensor<10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105, %107 : tensor<10xf32>, tensor<10xf32>) outs(%108 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%110 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%111 = linalg.init_tensor [10] : tensor<10xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %110, %109 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%111 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%113 = tensor.expand_shape %112 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%114 = tensor.extract %35[] : tensor<i32>
%115 = arith.maxsi %114, %c0_i32 : i32
%116 = arith.minsi %115, %c4_i32 : i32
%117 = arith.index_cast %116 : i32 to index
%118 = tensor.insert_slice %112 into %38[%117, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%41, %96, %113, %118 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %38 : tensor<5x1x10xf32>
}
// -----// IR Dump After InterchangeGenericOps //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = linalg.init_tensor [5, 64] : tensor<5x64xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<5x64xf32>) outs(%5 : tensor<5x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x64xf32>
%7 = tensor.expand_shape %6 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%8 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%8 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5xf32>
%11 = tensor.expand_shape %10 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%12 = linalg.init_tensor [5] : tensor<5xf32>
%13 = linalg.fill ins(%cst_4 : f32) outs(%12 : tensor<5xf32>) -> tensor<5xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<5xf32>) outs(%13 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<5xf32>
%15 = linalg.init_tensor [5] : tensor<5xi1>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%17 = linalg.init_tensor [5] : tensor<5xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%16 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%18, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%20 = linalg.init_tensor [] : tensor<i32>
%21 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%19 : tensor<5xi32>) outs(%21 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%23 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %22 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%24 = linalg.init_tensor [] : tensor<i1>
%25 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%23, %cst_10 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%26 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%14 : tensor<5xf32>) outs(%12 : tensor<5xf32>) : tensor<5xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%27 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%29 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%30 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%29 : tensor<5xi32>) outs(%30 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%32 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %31 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%25, %cst_6, %32 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%119 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %23 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
cf.br ^bb1(%33, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%35: tensor<i32>, %36: tensor<1x10xf32>, %37: tensor<1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %34 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2, ^bb3
^bb2: // pred: ^bb1
%41 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %cst_7 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.addi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%42 = linalg.init_tensor [] : tensor<f32>
%43 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%42 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = tensor.extract %11[%119, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %120 : f32
} -> tensor<f32>
%44 = linalg.init_tensor [10] : tensor<10xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43 : tensor<f32>) outs(%44 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%46 = linalg.init_tensor [10] : tensor<10xi1>
%47 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45, %cst : tensor<10xf32>, tensor<10xf32>) outs(%46 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<10xi1>
%48 = linalg.init_tensor [64] : tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35 : tensor<i32>) outs(%48 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = linalg.index 0 : index
%121 = tensor.extract %7[%119, %c0, %120] : tensor<5x1x64xf32>
linalg.yield %121 : f32
} -> tensor<64xf32>
%50 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%51 = tensor.insert_slice %49 into %50[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%52 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%53 = tensor.insert_slice %52 into %51[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%54 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%55 = linalg.fill ins(%cst_5 : f32) outs(%54 : tensor<1x40xf32>) -> tensor<1x40xf32>
%56 = linalg.matmul ins(%53, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%55 : tensor<1x40xf32>) -> tensor<1x40xf32>
%57 = tensor.collapse_shape %56 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%58 = linalg.init_tensor [40] : tensor<40xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%58 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<40xf32>
%60 = tensor.expand_shape %59 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%61 = tensor.extract_slice %60[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63 : tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%66 = linalg.init_tensor [10] : tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%66 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%68 = linalg.init_tensor [10] : tensor<10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%68 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%70 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%71 = linalg.init_tensor [10] : tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %70 : tensor<10xf32>, tensor<10xf32>) outs(%71 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%73 = tensor.extract_slice %60[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75 : tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%78 = linalg.init_tensor [10] : tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%78 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%80 = linalg.init_tensor [10] : tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%80 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%82 = tensor.extract_slice %60[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%82 : tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %84 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %86 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%89 = linalg.init_tensor [10] : tensor<10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%89 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%91 = linalg.init_tensor [10] : tensor<10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%90, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%91 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.maxf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%93 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %93, %92 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%96 = tensor.expand_shape %95 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%97 = tensor.extract_slice %60[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99 : tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%103, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%106 = linalg.init_tensor [10] : tensor<10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%92 : tensor<10xf32>) outs(%106 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%108 = linalg.init_tensor [10] : tensor<10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105, %107 : tensor<10xf32>, tensor<10xf32>) outs(%108 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%110 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%111 = linalg.init_tensor [10] : tensor<10xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %110, %109 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%111 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%113 = tensor.expand_shape %112 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%114 = tensor.extract %35[] : tensor<i32>
%115 = arith.maxsi %114, %c0_i32 : i32
%116 = arith.minsi %115, %c4_i32 : i32
%117 = arith.index_cast %116 : i32 to index
%118 = tensor.insert_slice %112 into %38[%117, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%41, %96, %113, %118 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %38 : tensor<5x1x10xf32>
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = linalg.init_tensor [5, 64] : tensor<5x64xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<5x64xf32>) outs(%5 : tensor<5x64xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5x64xf32>
%7 = tensor.expand_shape %6 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%8 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%9 = linalg.init_tensor [5] : tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%8 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<5xf32>
%11 = tensor.expand_shape %10 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%12 = linalg.init_tensor [5] : tensor<5xf32>
%13 = linalg.fill ins(%cst_4 : f32) outs(%12 : tensor<5xf32>) -> tensor<5xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10 : tensor<5xf32>) outs(%13 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<5xf32>
%15 = linalg.init_tensor [5] : tensor<5xi1>
%16 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%17 = linalg.init_tensor [5] : tensor<5xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%16 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%18, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%20 = linalg.init_tensor [] : tensor<i32>
%21 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%19 : tensor<5xi32>) outs(%21 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%23 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %22 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%24 = linalg.init_tensor [] : tensor<i1>
%25 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%23, %cst_10 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%26 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%14 : tensor<5xf32>) outs(%12 : tensor<5xf32>) : tensor<5xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%26, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%15 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<5xi1>
%28 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%27 : tensor<5xi1>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%119 = arith.extui %arg2 : i1 to i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%29 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%28, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.muli %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<5xi32>
%30 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%20 : tensor<i32>) -> tensor<i32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%29 : tensor<5xi32>) outs(%30 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%119 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%32 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %31 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%33 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%25, %cst_6, %32 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%119 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%34 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %23 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.subi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
cf.br ^bb1(%33, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%35: tensor<i32>, %36: tensor<1x10xf32>, %37: tensor<1x10xf32>, %38: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %34 : tensor<i32>, tensor<i32>) outs(%24 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%119 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %119 : i1
} -> tensor<i1>
%40 = tensor.extract %39[] : tensor<i1>
cf.cond_br %40, ^bb2, ^bb3
^bb2: // pred: ^bb1
%41 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %cst_7 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%119 = arith.addi %arg2, %arg3 : i32
linalg.yield %119 : i32
} -> tensor<i32>
%42 = linalg.init_tensor [] : tensor<f32>
%43 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%42 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = tensor.extract %11[%119, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %120 : f32
} -> tensor<f32>
%44 = linalg.init_tensor [10] : tensor<10xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43 : tensor<f32>) outs(%44 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%46 = linalg.init_tensor [10] : tensor<10xi1>
%47 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%45, %cst : tensor<10xf32>, tensor<10xf32>) outs(%46 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%119 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %119 : i1
} -> tensor<10xi1>
%48 = linalg.init_tensor [64] : tensor<64xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%35 : tensor<i32>) outs(%48 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%119 = arith.index_cast %arg2 : i32 to index
%120 = linalg.index 0 : index
%121 = tensor.extract %7[%119, %c0, %120] : tensor<5x1x64xf32>
linalg.yield %121 : f32
} -> tensor<64xf32>
%50 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%51 = tensor.insert_slice %49 into %50[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%52 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%53 = tensor.insert_slice %52 into %51[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%54 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%55 = linalg.fill ins(%cst_5 : f32) outs(%54 : tensor<1x40xf32>) -> tensor<1x40xf32>
%56 = linalg.matmul ins(%53, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%55 : tensor<1x40xf32>) -> tensor<1x40xf32>
%57 = tensor.collapse_shape %56 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%58 = linalg.init_tensor [40] : tensor<40xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%58 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<40xf32>
%60 = tensor.expand_shape %59 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%61 = tensor.extract_slice %60[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63 : tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%66 = linalg.init_tensor [10] : tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%66 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%68 = linalg.init_tensor [10] : tensor<10xf32>
%69 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%68 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%70 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%71 = linalg.init_tensor [10] : tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %70 : tensor<10xf32>, tensor<10xf32>) outs(%71 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%73 = tensor.extract_slice %60[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75 : tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%78 = linalg.init_tensor [10] : tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%78 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%80 = linalg.init_tensor [10] : tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%80 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%82 = tensor.extract_slice %60[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%82 : tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %84 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %86 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%89 = linalg.init_tensor [10] : tensor<10xf32>
%90 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%89 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.minf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%91 = linalg.init_tensor [10] : tensor<10xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%90, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%91 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.maxf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%93 = tensor.collapse_shape %36 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %93, %92 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%96 = tensor.expand_shape %95 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%97 = tensor.extract_slice %60[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99 : tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%103, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.addf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%106 = linalg.init_tensor [10] : tensor<10xf32>
%107 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%92 : tensor<10xf32>) outs(%106 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%119 = math.tanh %arg2 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%108 = linalg.init_tensor [10] : tensor<10xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%105, %107 : tensor<10xf32>, tensor<10xf32>) outs(%108 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%119 = arith.mulf %arg2, %arg3 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%110 = tensor.collapse_shape %37 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%111 = linalg.init_tensor [10] : tensor<10xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%47, %110, %109 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%111 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%119 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %119 : f32
} -> tensor<10xf32>
%113 = tensor.expand_shape %112 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%114 = tensor.extract %35[] : tensor<i32>
%115 = arith.maxsi %114, %c0_i32 : i32
%116 = arith.minsi %115, %c4_i32 : i32
%117 = arith.index_cast %116 : i32 to index
%118 = tensor.insert_slice %112 into %38[%117, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%41, %96, %113, %118 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %38 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.fill ins(%cst_4 : f32) outs(%8 : tensor<5xf32>) -> tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = arith.minf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<5xf32>
%11 = linalg.init_tensor [5] : tensor<5xi1>
%12 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%115 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %115 : i1
} -> tensor<5xi1>
%13 = linalg.init_tensor [5] : tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%12 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%115 = arith.extui %arg2 : i1 to i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.muli %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%115 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %18 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.subi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%20 = linalg.init_tensor [] : tensor<i1>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%19, %cst_10 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%115 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %115 : i1
} -> tensor<i1>
%22 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%10 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%115 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %115 : i1
} -> tensor<5xi1>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%115 = arith.extui %arg2 : i1 to i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.muli %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<5xi32>
%26 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%25 : tensor<5xi32>) outs(%26 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%115 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %27 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.subi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%21, %cst_6, %28 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%115 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %19 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.subi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
cf.br ^bb1(%29, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%31: tensor<i32>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %30 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%115 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %115 : i1
} -> tensor<i1>
%36 = tensor.extract %35[] : tensor<i1>
cf.cond_br %36, ^bb2, ^bb3
^bb2: // pred: ^bb1
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %cst_7 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%115 = arith.addi %arg2, %arg3 : i32
linalg.yield %115 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<f32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31 : tensor<i32>) outs(%38 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%115 = arith.index_cast %arg2 : i32 to index
%116 = tensor.extract %7[%115, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %116 : f32
} -> tensor<f32>
%40 = linalg.init_tensor [10] : tensor<10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%39 : tensor<f32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%42 = linalg.init_tensor [10] : tensor<10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%41, %cst : tensor<10xf32>, tensor<10xf32>) outs(%42 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%115 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %115 : i1
} -> tensor<10xi1>
%44 = linalg.init_tensor [64] : tensor<64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%31 : tensor<i32>) outs(%44 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%115 = arith.index_cast %arg2 : i32 to index
%116 = linalg.index 0 : index
%117 = tensor.extract %5[%115, %c0, %116] : tensor<5x1x64xf32>
linalg.yield %117 : f32
} -> tensor<64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%48 = tensor.collapse_shape %33 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%49 = tensor.insert_slice %48 into %47[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%50 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%51 = linalg.fill ins(%cst_5 : f32) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.matmul ins(%49, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%54 = linalg.init_tensor [40] : tensor<40xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%53, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%54 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<40xf32>
%56 = tensor.expand_shape %55 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%57 = tensor.extract_slice %56[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%58 = linalg.init_tensor [10] : tensor<10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%58 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%60 = linalg.init_tensor [10] : tensor<10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%59 : tensor<10xf32>) outs(%60 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%62 = linalg.init_tensor [10] : tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%62 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%64 = linalg.init_tensor [10] : tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%64 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%66 = tensor.collapse_shape %32 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%67 = linalg.init_tensor [10] : tensor<10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65, %66 : tensor<10xf32>, tensor<10xf32>) outs(%67 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%69 = tensor.extract_slice %56[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%70 = linalg.init_tensor [10] : tensor<10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%70 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%72 = linalg.init_tensor [10] : tensor<10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%71 : tensor<10xf32>) outs(%72 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%74 = linalg.init_tensor [10] : tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%74 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%76 = linalg.init_tensor [10] : tensor<10xf32>
%77 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%75, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%76 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%78 = tensor.extract_slice %56[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%79 = linalg.init_tensor [10] : tensor<10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%78 : tensor<10xf32>) outs(%79 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%81 = linalg.init_tensor [10] : tensor<10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %80 : tensor<10xf32>, tensor<10xf32>) outs(%81 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%83 = linalg.init_tensor [10] : tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%68, %82 : tensor<10xf32>, tensor<10xf32>) outs(%83 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%85 = linalg.init_tensor [10] : tensor<10xf32>
%86 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%84, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%85 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.minf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%87 = linalg.init_tensor [10] : tensor<10xf32>
%88 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%86, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%87 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.maxf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%89 = tensor.collapse_shape %32 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%90 = linalg.init_tensor [10] : tensor<10xf32>
%91 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %89, %88 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%90 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%115 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%92 = tensor.expand_shape %91 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%93 = tensor.extract_slice %56[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%94 = linalg.init_tensor [10] : tensor<10xf32>
%95 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%93, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%94 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%96 = linalg.init_tensor [10] : tensor<10xf32>
%97 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%95 : tensor<10xf32>) outs(%96 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%98 = linalg.init_tensor [10] : tensor<10xf32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%98 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%100 = linalg.init_tensor [10] : tensor<10xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%99, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%100 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.addf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%102 = linalg.init_tensor [10] : tensor<10xf32>
%103 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%88 : tensor<10xf32>) outs(%102 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%115 = math.tanh %arg2 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%104 = linalg.init_tensor [10] : tensor<10xf32>
%105 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%101, %103 : tensor<10xf32>, tensor<10xf32>) outs(%104 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%115 = arith.mulf %arg2, %arg3 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%106 = tensor.collapse_shape %33 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%107 = linalg.init_tensor [10] : tensor<10xf32>
%108 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %106, %105 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%107 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%115 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %115 : f32
} -> tensor<10xf32>
%109 = tensor.expand_shape %108 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%110 = tensor.extract %31[] : tensor<i32>
%111 = arith.maxsi %110, %c0_i32 : i32
%112 = arith.minsi %111, %c4_i32 : i32
%113 = arith.index_cast %112 : i32 to index
%114 = tensor.insert_slice %108 into %34[%113, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%37, %92, %109, %114 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %34 : tensor<5x1x10xf32>
}
// -----// IR Dump After CSE //----- //
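// CSE of the canonicalized form: the per-op linalg.init_tensor [10] destinations
// in the loop body are deduplicated into a single %40, and the duplicate
// tensor.collapse_shape ops on the two loop-carried 1x10 states now appear once
// each (%62 for %32, %48 for %33); the computation itself is unchanged.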
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant dense<0.000000e+00> : tensor<10xf32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<40xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant dense<1.000000e+01> : tensor<10xf32>
%cst_2 = arith.constant dense<-1.000000e+01> : tensor<10xf32>
%cst_3 = arith.constant dense<5.000000e-01> : tensor<10xf32>
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_4 = arith.constant 0x7F800000 : f32
%cst_5 = arith.constant 0.000000e+00 : f32
%cst_6 = arith.constant dense<0> : tensor<i32>
%cst_7 = arith.constant dense<1> : tensor<i32>
%cst_8 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_9 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_10 = arith.constant dense<5> : tensor<i32>
%cst_11 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_12 = arith.constant dense<0.000000e+00> : tensor<5xf32>
%cst_13 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst_5 : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.fill ins(%cst_4 : f32) outs(%8 : tensor<5xf32>) -> tensor<5xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6 : tensor<5xf32>) outs(%9 : tensor<5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = arith.minf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<5xf32>
%11 = linalg.init_tensor [5] : tensor<5xi1>
%12 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%10, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%91 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %91 : i1
} -> tensor<5xi1>
%13 = linalg.init_tensor [5] : tensor<5xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%12 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%91 = arith.extui %arg2 : i1 to i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%14, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.muli %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%16 = linalg.init_tensor [] : tensor<i32>
%17 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%15 : tensor<5xi32>) outs(%17 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%91 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%19 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %18 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.subi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%20 = linalg.init_tensor [] : tensor<i1>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%19, %cst_10 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%91 = arith.cmpi eq, %arg2, %arg3 : i32
linalg.yield %91 : i1
} -> tensor<i1>
%22 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%10 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22, %cst_12 : tensor<5xf32>, tensor<5xf32>) outs(%11 : tensor<5xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%91 = arith.cmpf oeq, %arg2, %arg3 : f32
linalg.yield %91 : i1
} -> tensor<5xi1>
%24 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%23 : tensor<5xi1>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i1, %arg3: i32):
%91 = arith.extui %arg2 : i1 to i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%24, %cst_11 : tensor<5xi32>, tensor<5xi32>) outs(%13 : tensor<5xi32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.muli %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<5xi32>
%26 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%16 : tensor<i32>) -> tensor<i32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%25 : tensor<5xi32>) outs(%26 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%91 = arith.maxsi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %27 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.subi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%29 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%21, %cst_6, %28 : tensor<i1>, tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i1, %arg3: i32, %arg4: i32, %arg5: i32):
%91 = arith.select %arg2, %arg3, %arg4 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%cst_10, %19 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.subi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
cf.br ^bb1(%29, %cst_13, %cst_13, %cst_8 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%31: tensor<i32>, %32: tensor<1x10xf32>, %33: tensor<1x10xf32>, %34: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%35 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %30 : tensor<i32>, tensor<i32>) outs(%20 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%91 = arith.cmpi slt, %arg2, %arg3 : i32
linalg.yield %91 : i1
} -> tensor<i1>
%36 = tensor.extract %35[] : tensor<i1>
cf.cond_br %36, ^bb2, ^bb3
^bb2: // pred: ^bb1
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31, %cst_7 : tensor<i32>, tensor<i32>) outs(%16 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%91 = arith.addi %arg2, %arg3 : i32
linalg.yield %91 : i32
} -> tensor<i32>
%38 = linalg.init_tensor [] : tensor<f32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%31 : tensor<i32>) outs(%38 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%91 = arith.index_cast %arg2 : i32 to index
%92 = tensor.extract %7[%91, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %92 : f32
} -> tensor<f32>
%40 = linalg.init_tensor [10] : tensor<10xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%39 : tensor<f32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
} -> tensor<10xf32>
%42 = linalg.init_tensor [10] : tensor<10xi1>
%43 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%41, %cst : tensor<10xf32>, tensor<10xf32>) outs(%42 : tensor<10xi1>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: i1):
%91 = arith.cmpf ogt, %arg2, %arg3 : f32
linalg.yield %91 : i1
} -> tensor<10xi1>
%44 = linalg.init_tensor [64] : tensor<64xf32>
%45 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%31 : tensor<i32>) outs(%44 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%91 = arith.index_cast %arg2 : i32 to index
%92 = linalg.index 0 : index
%93 = tensor.extract %5[%91, %c0, %92] : tensor<5x1x64xf32>
linalg.yield %93 : f32
} -> tensor<64xf32>
%46 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%47 = tensor.insert_slice %45 into %46[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%48 = tensor.collapse_shape %33 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%49 = tensor.insert_slice %48 into %47[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%50 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%51 = linalg.fill ins(%cst_5 : f32) outs(%50 : tensor<1x40xf32>) -> tensor<1x40xf32>
%52 = linalg.matmul ins(%49, %cst_9 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%51 : tensor<1x40xf32>) -> tensor<1x40xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x40xf32> into tensor<40xf32>
%54 = linalg.init_tensor [40] : tensor<40xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%53, %cst_0 : tensor<40xf32>, tensor<40xf32>) outs(%54 : tensor<40xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<40xf32>
%56 = tensor.expand_shape %55 [[0, 1]] : tensor<40xf32> into tensor<1x40xf32>
%57 = tensor.extract_slice %56[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%57, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%59 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%58 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%60 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%59, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%61 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%60, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%62 = tensor.collapse_shape %32 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%63 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%61, %62 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%64 = tensor.extract_slice %56[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%65 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%64, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%65 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%67 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%66, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%67, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%69 = tensor.extract_slice %56[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%70 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%69 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%68, %70 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%72 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%63, %71 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%73 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%72, %cst_1 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.minf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%73, %cst_2 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.maxf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%75 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %62, %74 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%91 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%76 = tensor.expand_shape %75 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%77 = tensor.extract_slice %56[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%78 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%77, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%78 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%80 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%79, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%81 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%80, %cst_3 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.addf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%74 : tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%91 = math.tanh %arg2 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%83 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%81, %82 : tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%91 = arith.mulf %arg2, %arg3 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%84 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%43, %48, %83 : tensor<10xi1>, tensor<10xf32>, tensor<10xf32>) outs(%40 : tensor<10xf32>) {
^bb0(%arg2: i1, %arg3: f32, %arg4: f32, %arg5: f32):
%91 = arith.select %arg2, %arg3, %arg4 : f32
linalg.yield %91 : f32
} -> tensor<10xf32>
%85 = tensor.expand_shape %84 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%86 = tensor.extract %31[] : tensor<i32>
%87 = arith.maxsi %86, %c0_i32 : i32
%88 = arith.minsi %87, %c4_i32 : i32
%89 = arith.index_cast %88 : i32 to index
%90 = tensor.insert_slice %84 into %34[%89, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%37, %76, %85, %90 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %34 : tensor<5x1x10xf32>
}
// -----// IR Dump After FusionOfTensorOps //----- //
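// After elementwise fusion the loop body is much smaller: the chains of
// per-gate generics collapse into three fused linalg.generic ops on
// tensor<1x10xf32> (%49 builds the clamped candidate state from the gate slices
// at offsets 0/10/20, %51 and %55 apply the step-flag select that carries the
// two recurrent states forward), and the index computation in the prelude is
// fused into single reduction generics (%15, %20) that combine the
// cmpf/extui/muli/maxsi steps.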
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.fill ins(%cst_3 : f32) outs(%8 : tensor<5xf32>) -> tensor<5xf32>
%10 = tensor.expand_shape %9 [[0, 1]] : tensor<5xf32> into tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.minf %arg2, %arg3 : f32
linalg.yield %62 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi eq, %arg2, %c5_i32 : i32
%64 = arith.select %63, %c0_i32, %62 : i32
linalg.yield %64 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi slt, %arg2, %62 : i32
linalg.yield %63 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = tensor.extract %7[%62, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %63 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = linalg.index 0 : index
%64 = tensor.extract %5[%62, %c0, %63] : tensor<5x1x64xf32>
linalg.yield %64 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%40 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.addf %arg2, %cst : f32
linalg.yield %62 : f32
} -> tensor<1x40xf32>
%42 = tensor.extract_slice %41[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %41[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.extract_slice %41[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = tensor.expand_shape %44 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%48 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %23, %46, %47 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%48 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg2, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %arg4, %cst_2 : f32
%68 = math.tanh %67 : f32
%69 = arith.mulf %68, %cst_2 : f32
%70 = arith.addf %69, %cst_2 : f32
%71 = arith.mulf %70, %62 : f32
%72 = arith.mulf %66, %arg3 : f32
%73 = arith.addf %72, %71 : f32
%74 = arith.minf %73, %cst_0 : f32
%75 = arith.maxf %74, %cst_1 : f32
linalg.yield %75 : f32
} -> tensor<1x10xf32>
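    // Note: the mulf-by-0.5 / tanh / mulf-by-0.5 / addf-0.5 sequences above are
    // the identity sigmoid(x) = 0.5 * tanh(x / 2) + 0.5 applied to the gate
    // pre-activations.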
%50 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%50 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%62 = arith.cmpf ogt, %arg2, %cst : f32
%63 = arith.select %62, %arg3, %arg4 : f32
linalg.yield %63 : f32
} -> tensor<1x10xf32>
%52 = tensor.extract_slice %41[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%53 = tensor.expand_shape %52 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %53, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg4, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %66, %62 : f32
%68 = arith.cmpf ogt, %arg2, %cst : f32
%69 = arith.select %68, %arg3, %67 : f32
linalg.yield %69 : f32
} -> tensor<1x10xf32>
%56 = tensor.collapse_shape %55 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%57 = tensor.extract %22[] : tensor<i32>
%58 = arith.maxsi %57, %c0_i32 : i32
%59 = arith.minsi %58, %c4_i32 : i32
%60 = arith.index_cast %59 : i32 to index
%61 = tensor.insert_slice %56 into %25[%60, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %51, %55, %61 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
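// Canonicalization after fusion only cleans up the prelude: the +inf fill that
// seeds the min-reduction over %arg0 is now created directly as a 1x5 tensor
// (%9/%10) instead of filling a 5-element tensor and expanding it; the loop
// body is unchanged.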
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.minf %arg2, %arg3 : f32
linalg.yield %62 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%62 = arith.cmpf oeq, %arg2, %cst : f32
%63 = arith.extui %62 : i1 to i32
%64 = arith.muli %63, %arg3 : i32
%65 = arith.maxsi %64, %arg4 : i32
linalg.yield %65 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi eq, %arg2, %c5_i32 : i32
%64 = arith.select %63, %c0_i32, %62 : i32
linalg.yield %64 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%62 = arith.subi %c5_i32, %arg3 : i32
%63 = arith.cmpi slt, %arg2, %62 : i32
linalg.yield %63 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%62 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = tensor.extract %7[%62, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %63 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%62 = arith.index_cast %arg2 : i32 to index
%63 = linalg.index 0 : index
%64 = tensor.extract %5[%62, %c0, %63] : tensor<5x1x64xf32>
linalg.yield %64 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%40 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%62 = arith.addf %arg2, %cst : f32
linalg.yield %62 : f32
} -> tensor<1x40xf32>
%42 = tensor.extract_slice %41[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %41[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.extract_slice %41[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = tensor.expand_shape %44 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%48 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%45, %23, %46, %47 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%48 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg2, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %arg4, %cst_2 : f32
%68 = math.tanh %67 : f32
%69 = arith.mulf %68, %cst_2 : f32
%70 = arith.addf %69, %cst_2 : f32
%71 = arith.mulf %70, %62 : f32
%72 = arith.mulf %66, %arg3 : f32
%73 = arith.addf %72, %71 : f32
%74 = arith.minf %73, %cst_0 : f32
%75 = arith.maxf %74, %cst_1 : f32
linalg.yield %75 : f32
} -> tensor<1x10xf32>
%50 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%50 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%62 = arith.cmpf ogt, %arg2, %cst : f32
%63 = arith.select %62, %arg3, %arg4 : f32
linalg.yield %63 : f32
} -> tensor<1x10xf32>
%52 = tensor.extract_slice %41[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%53 = tensor.expand_shape %52 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%54 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %53, %49 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%54 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%62 = math.tanh %arg5 : f32
%63 = arith.mulf %arg4, %cst_2 : f32
%64 = math.tanh %63 : f32
%65 = arith.mulf %64, %cst_2 : f32
%66 = arith.addf %65, %cst_2 : f32
%67 = arith.mulf %66, %62 : f32
%68 = arith.cmpf ogt, %arg2, %cst : f32
%69 = arith.select %68, %arg3, %67 : f32
linalg.yield %69 : f32
} -> tensor<1x10xf32>
%56 = tensor.collapse_shape %55 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%57 = tensor.extract %22[] : tensor<i32>
%58 = arith.maxsi %57, %c0_i32 : i32
%59 = arith.minsi %58, %c4_i32 : i32
%60 = arith.index_cast %59 : i32 to index
%61 = tensor.insert_slice %56 into %25[%60, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %51, %55, %61 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
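// Note (illustrative sketch, not part of the pass output): the fused linalg.generic
// ops above implement one LSTM cell step. Each gate is computed from a 10-wide slice
// of the 1x40 matmul result; the repeated "mulf 0.5, tanh, mulf 0.5, addf 0.5"
// sequence is the tanh form of the logistic sigmoid; the updated cell state is
// clamped to [-10, 10] (arith.minf / arith.maxf); and the two select-based generics
// choose between the previous and newly computed h/c using a per-timestep scalar.
// A minimal standalone form of that sigmoid identity:
func.func @sigmoid_via_tanh(%x: f32) -> f32 {
  // sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5
  %half = arith.constant 5.000000e-01 : f32
  %0 = arith.mulf %x, %half : f32
  %1 = math.tanh %0 : f32
  %2 = arith.mulf %1, %half : f32
  %3 = arith.addf %2, %half : f32
  return %3 : f32
}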
// -----// IR Dump After CSE //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.minf %arg2, %arg3 : f32
linalg.yield %59 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi eq, %arg2, %c5_i32 : i32
%61 = arith.select %60, %c0_i32, %59 : i32
linalg.yield %61 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi slt, %arg2, %59 : i32
linalg.yield %60 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = tensor.extract %7[%59, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %60 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = linalg.index 0 : index
%61 = tensor.extract %5[%59, %c0, %60] : tensor<5x1x64xf32>
linalg.yield %61 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%37 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.addf %arg2, %cst : f32
linalg.yield %59 : f32
} -> tensor<1x40xf32>
%41 = tensor.extract_slice %40[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%42 = tensor.extract_slice %40[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %40[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.expand_shape %41 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %23, %45, %46 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg2, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %arg4, %cst_2 : f32
%65 = math.tanh %64 : f32
%66 = arith.mulf %65, %cst_2 : f32
%67 = arith.addf %66, %cst_2 : f32
%68 = arith.mulf %67, %59 : f32
%69 = arith.mulf %63, %arg3 : f32
%70 = arith.addf %69, %68 : f32
%71 = arith.minf %70, %cst_0 : f32
%72 = arith.maxf %71, %cst_1 : f32
linalg.yield %72 : f32
} -> tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%59 = arith.cmpf ogt, %arg2, %cst : f32
%60 = arith.select %59, %arg3, %arg4 : f32
linalg.yield %60 : f32
} -> tensor<1x10xf32>
%50 = tensor.extract_slice %40[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%51 = tensor.expand_shape %50 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %51, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg4, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %63, %59 : f32
%65 = arith.cmpf ogt, %arg2, %cst : f32
%66 = arith.select %65, %arg3, %64 : f32
linalg.yield %66 : f32
} -> tensor<1x10xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%54 = tensor.extract %22[] : tensor<i32>
%55 = arith.maxsi %54, %c0_i32 : i32
%56 = arith.minsi %55, %c4_i32 : i32
%57 = arith.index_cast %56 : i32 to index
%58 = tensor.insert_slice %53 into %25[%57, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %49, %52, %58 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
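// Note (illustrative sketch, not part of the pass output): relative to the dump
// before it, CSE has deduplicated the structurally identical linalg.init_tensor ops
// in the loop body (the bias-add generic now reuses the [1, 40] init %37, and all
// three element-wise generics share the single [1, 10] init %47), and the local SSA
// numbering has been compacted accordingly. A minimal example of the kind of
// redundancy CSE removes here (after the pass, both results refer to one op):
func.func @duplicate_inits() -> (tensor<1x10xf32>, tensor<1x10xf32>) {
  %0 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
  %1 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
  return %0, %1 : tensor<1x10xf32>, tensor<1x10xf32>
}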
// -----// IR Dump After SplitReduction //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.minf %arg2, %arg3 : f32
linalg.yield %59 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi eq, %arg2, %c5_i32 : i32
%61 = arith.select %60, %c0_i32, %59 : i32
linalg.yield %61 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi slt, %arg2, %59 : i32
linalg.yield %60 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = tensor.extract %7[%59, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %60 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = linalg.index 0 : index
%61 = tensor.extract %5[%59, %c0, %60] : tensor<5x1x64xf32>
linalg.yield %61 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%37 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.addf %arg2, %cst : f32
linalg.yield %59 : f32
} -> tensor<1x40xf32>
%41 = tensor.extract_slice %40[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%42 = tensor.extract_slice %40[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %40[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.expand_shape %41 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %23, %45, %46 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg2, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %arg4, %cst_2 : f32
%65 = math.tanh %64 : f32
%66 = arith.mulf %65, %cst_2 : f32
%67 = arith.addf %66, %cst_2 : f32
%68 = arith.mulf %67, %59 : f32
%69 = arith.mulf %63, %arg3 : f32
%70 = arith.addf %69, %68 : f32
%71 = arith.minf %70, %cst_0 : f32
%72 = arith.maxf %71, %cst_1 : f32
linalg.yield %72 : f32
} -> tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%59 = arith.cmpf ogt, %arg2, %cst : f32
%60 = arith.select %59, %arg3, %arg4 : f32
linalg.yield %60 : f32
} -> tensor<1x10xf32>
%50 = tensor.extract_slice %40[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%51 = tensor.expand_shape %50 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %51, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg4, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %63, %59 : f32
%65 = arith.cmpf ogt, %arg2, %cst : f32
%66 = arith.select %65, %arg3, %64 : f32
linalg.yield %66 : f32
} -> tensor<1x10xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%54 = tensor.extract %22[] : tensor<i32>
%55 = arith.maxsi %54, %c0_i32 : i32
%56 = arith.minsi %55, %c4_i32 : i32
%57 = arith.index_cast %56 : i32 to index
%58 = tensor.insert_slice %53 into %25[%57, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %49, %52, %58 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
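// Note (illustrative, not part of the pass output): this SplitReduction dump is
// unchanged from the CSE output. The pass rewrites a large reduction dimension
// K = P * T as P partial reductions of length T followed by a final combine:
//   sum_{k=0}^{K-1} a_k = sum_{p=0}^{P-1} ( sum_{t=0}^{T-1} a_{p*T + t} )
// The reductions in this module (the 5-element generics and the 1x74x40 matmul)
// appear to be below the split threshold, so nothing is rewritten.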
// -----// IR Dump After InterchangeGenericOps //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%cst = arith.constant 0.000000e+00 : f32
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%cst_0 = arith.constant 1.000000e+01 : f32
%cst_1 = arith.constant -1.000000e+01 : f32
%cst_2 = arith.constant 5.000000e-01 : f32
%c0 = arith.constant 0 : index
%c4_i32 = arith.constant 4 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0x7F800000 : f32
%cst_4 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_5 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%cst_6 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = linalg.init_tensor [1, 5, 64] : tensor<1x5x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x5x64xf32>) -> tensor<1x5x64xf32>
%2 = tensor.collapse_shape %arg1 [[0, 1], [2, 3]] : tensor<1x5x2x2xf32> into tensor<5x4xf32>
%3 = tensor.insert_slice %2 into %1[0, 0, 0] [1, 5, 4] [1, 1, 1] : tensor<5x4xf32> into tensor<1x5x64xf32>
%4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x5x64xf32> into tensor<5x64xf32>
%5 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<5x64xf32> into tensor<5x1x64xf32>
%6 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%7 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<5xf32> into tensor<5x1x1xf32>
%8 = linalg.init_tensor [5] : tensor<5xf32>
%9 = linalg.init_tensor [1, 5] : tensor<1x5xf32>
%10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<1x5xf32>) -> tensor<1x5xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x5xf32>) outs(%10 : tensor<1x5xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.minf %arg2, %arg3 : f32
linalg.yield %59 : f32
} -> tensor<1x5xf32>
%12 = tensor.collapse_shape %11 [[0, 1]] : tensor<1x5xf32> into tensor<5xf32>
%13 = linalg.init_tensor [] : tensor<i32>
%14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%12, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%14 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%16 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%15 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.subi %c5_i32, %arg2 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%17 = linalg.init_tensor [] : tensor<i1>
%18 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%12 : tensor<5xf32>) outs(%8 : tensor<5xf32>) : tensor<5xf32>
%19 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%13 : tensor<i32>) -> tensor<i32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%18, %cst_6 : tensor<5xf32>, tensor<5xi32>) outs(%19 : tensor<i32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: i32):
%59 = arith.cmpf oeq, %arg2, %cst : f32
%60 = arith.extui %59 : i1 to i32
%61 = arith.muli %60, %arg3 : i32
%62 = arith.maxsi %61, %arg4 : i32
linalg.yield %62 : i32
} -> tensor<i32>
%21 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%16, %20 : tensor<i32>, tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i32):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi eq, %arg2, %c5_i32 : i32
%61 = arith.select %60, %c0_i32, %59 : i32
linalg.yield %61 : i32
} -> tensor<i32>
cf.br ^bb1(%21, %cst_7, %cst_7, %cst_4 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%22: tensor<i32>, %23: tensor<1x10xf32>, %24: tensor<1x10xf32>, %25: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%26 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22, %16 : tensor<i32>, tensor<i32>) outs(%17 : tensor<i1>) {
^bb0(%arg2: i32, %arg3: i32, %arg4: i1):
%59 = arith.subi %c5_i32, %arg3 : i32
%60 = arith.cmpi slt, %arg2, %59 : i32
linalg.yield %60 : i1
} -> tensor<i1>
%27 = tensor.extract %26[] : tensor<i1>
cf.cond_br %27, ^bb2, ^bb3
^bb2: // pred: ^bb1
%28 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%13 : tensor<i32>) {
^bb0(%arg2: i32, %arg3: i32):
%59 = arith.addi %arg2, %c1_i32 : i32
linalg.yield %59 : i32
} -> tensor<i32>
%29 = linalg.init_tensor [] : tensor<f32>
%30 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%22 : tensor<i32>) outs(%29 : tensor<f32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = tensor.extract %7[%59, %c0, %c0] : tensor<5x1x1xf32>
linalg.yield %60 : f32
} -> tensor<f32>
%31 = linalg.init_tensor [64] : tensor<64xf32>
%32 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%22 : tensor<i32>) outs(%31 : tensor<64xf32>) {
^bb0(%arg2: i32, %arg3: f32):
%59 = arith.index_cast %arg2 : i32 to index
%60 = linalg.index 0 : index
%61 = tensor.extract %5[%59, %c0, %60] : tensor<5x1x64xf32>
linalg.yield %61 : f32
} -> tensor<64xf32>
%33 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%34 = tensor.insert_slice %32 into %33[0, 0] [1, 64] [1, 1] : tensor<64xf32> into tensor<1x74xf32>
%35 = tensor.collapse_shape %24 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%36 = tensor.insert_slice %35 into %34[0, 64] [1, 10] [1, 1] : tensor<10xf32> into tensor<1x74xf32>
%37 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%38 = linalg.fill ins(%cst : f32) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.matmul ins(%36, %cst_5 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%38 : tensor<1x40xf32>) -> tensor<1x40xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%39 : tensor<1x40xf32>) outs(%37 : tensor<1x40xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%59 = arith.addf %arg2, %cst : f32
linalg.yield %59 : f32
} -> tensor<1x40xf32>
%41 = tensor.extract_slice %40[0, 20] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%42 = tensor.extract_slice %40[0, 10] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%43 = tensor.extract_slice %40[0, 0] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%44 = tensor.expand_shape %41 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%45 = tensor.expand_shape %42 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%46 = tensor.expand_shape %43 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%47 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%44, %23, %45, %46 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg2, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %arg4, %cst_2 : f32
%65 = math.tanh %64 : f32
%66 = arith.mulf %65, %cst_2 : f32
%67 = arith.addf %66, %cst_2 : f32
%68 = arith.mulf %67, %59 : f32
%69 = arith.mulf %63, %arg3 : f32
%70 = arith.addf %69, %68 : f32
%71 = arith.minf %70, %cst_0 : f32
%72 = arith.maxf %71, %cst_1 : f32
linalg.yield %72 : f32
} -> tensor<1x10xf32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %23, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32):
%59 = arith.cmpf ogt, %arg2, %cst : f32
%60 = arith.select %59, %arg3, %arg4 : f32
linalg.yield %60 : f32
} -> tensor<1x10xf32>
%50 = tensor.extract_slice %40[0, 30] [1, 10] [1, 1] : tensor<1x40xf32> to tensor<10xf32>
%51 = tensor.expand_shape %50 [[0, 1]] : tensor<10xf32> into tensor<1x10xf32>
%52 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%30, %24, %51, %48 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%47 : tensor<1x10xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
%59 = math.tanh %arg5 : f32
%60 = arith.mulf %arg4, %cst_2 : f32
%61 = math.tanh %60 : f32
%62 = arith.mulf %61, %cst_2 : f32
%63 = arith.addf %62, %cst_2 : f32
%64 = arith.mulf %63, %59 : f32
%65 = arith.cmpf ogt, %arg2, %cst : f32
%66 = arith.select %65, %arg3, %64 : f32
linalg.yield %66 : f32
} -> tensor<1x10xf32>
%53 = tensor.collapse_shape %52 [[0, 1]] : tensor<1x10xf32> into tensor<10xf32>
%54 = tensor.extract %22[] : tensor<i32>
%55 = arith.maxsi %54, %c0_i32 : i32
%56 = arith.minsi %55, %c4_i32 : i32
%57 = arith.index_cast %56 : i32 to index
%58 = tensor.insert_slice %53 into %25[%57, 0, 0] [1, 1, 10] [1, 1, 1] : tensor<10xf32> into tensor<5x1x10xf32>
cf.br ^bb1(%28, %49, %52, %58 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %25 : tensor<5x1x10xf32>
}
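// Note (illustrative sketch, not part of the pass output): this InterchangeGenericOps
// dump is also unchanged. The pass permutes the loop order of linalg.generic ops so
// that parallel iterators come before reduction iterators; the generics in this
// module are already either fully parallel or a single 1-D reduction, so there is
// nothing to interchange. A hypothetical op the pass would rewrite:
func.func @row_sum(%A: tensor<8x16xf32>, %acc: tensor<16xf32>) -> tensor<16xf32> {
  // The reduction iterator is listed first here; interchange would swap d0/d1 so
  // that iterator_types = ["parallel", "reduction"], with the indexing maps
  // permuted to (d0, d1) -> (d1, d0) and (d0, d1) -> (d0).
  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%A : tensor<8x16xf32>) outs(%acc : tensor<16xf32>) {
  ^bb0(%a: f32, %sum: f32):
    %1 = arith.addf %a, %sum : f32
    linalg.yield %1 : f32
  } -> tensor<16xf32>
  return %0 : tensor<16xf32>
}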
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%c0 = arith.constant 0 : index
%c30 = arith.constant 30 : index
%c64 = arith.constant 64 : index
%c40 = arith.constant 40 : index
%c10 = arith.constant 10 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = flow.tensor.splat %cst : tensor<1x5x64xf32>
%1 = flow.tensor.reshape %arg1 : tensor<1x5x2x2xf32> -> tensor<5x4xf32>
%2 = flow.dispatch.workgroups[%c4, %c5, %c1](%1, %0) : (tensor<5x4xf32>, tensor<1x5x64xf32>) -> %0 =
(%arg2: !flow.dispatch.tensor<readonly:5x4xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5x64xf32>) {
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [5, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x4xf32> -> tensor<5x4xf32>
flow.dispatch.tensor.store %35, %arg3, offsets = [%c0_3, %c0_3, %c0_3], sizes = [1, 5, 4], strides = [%c1_4, %c1_4, %c1_4] : tensor<5x4xf32> -> !flow.dispatch.tensor<readwrite:1x5x64xf32>
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<1x5x64xf32> -> tensor<5x1x64xf32>
%4 = flow.tensor.reshape %arg0 : tensor<1x5xf32> -> tensor<5x1x1xf32>
%5 = flow.tensor.splat %cst_0 : tensor<1x5xf32>
%6 = flow.dispatch.workgroups[%c5, %c1, %c1](%arg0, %5) : (tensor<1x5xf32>, tensor<1x5xf32>) -> %5 =
(%arg2: !flow.dispatch.tensor<readonly:1x5xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x5xf32> -> tensor<1x5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x5xf32> -> tensor<1x5xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<1x5xf32>) outs(%36 : tensor<1x5xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%38 = arith.minf %arg4, %arg5 : f32
linalg.yield %38 : f32
} -> tensor<1x5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : tensor<1x5xf32> -> !flow.dispatch.tensor<readwrite:1x5xf32>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<1x5xf32> -> tensor<5xf32>
%8 = flow.dispatch.workgroups[%c1, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%36 : tensor<i32>) -> tensor<i32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%37 : tensor<i32>) {
^bb0(%arg4: f32, %arg5: i32, %arg6: i32):
%40 = arith.cmpf oeq, %arg4, %cst_3 : f32
%41 = arith.extui %40 : i1 to i32
%42 = arith.muli %41, %arg5 : i32
%43 = arith.maxsi %42, %arg6 : i32
linalg.yield %43 : i32
} -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%38 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%40 = arith.subi %c5_i32, %arg4 : i32
linalg.yield %40 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %39, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%9 = flow.dispatch.workgroups[%c5, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<5xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [5] : tensor<5xf32>
%37 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%35 : tensor<5xf32>) outs(%36 : tensor<5xf32>) : tensor<5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0], sizes = [5], strides = [1] : tensor<5xf32> -> !flow.dispatch.tensor<writeonly:5xf32>
flow.return
}
%10 = flow.dispatch.workgroups[%c1, %c1, %c1](%9, %8) : (tensor<5xf32>, tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i32>
%38 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%37 : tensor<i32>) -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%38 : tensor<i32>) {
^bb0(%arg5: f32, %arg6: i32, %arg7: i32):
%41 = arith.cmpf oeq, %arg5, %cst_3 : f32
%42 = arith.extui %41 : i1 to i32
%43 = arith.muli %42, %arg6 : i32
%44 = arith.maxsi %43, %arg7 : i32
linalg.yield %44 : i32
} -> tensor<i32>
%40 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36, %39 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i32>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i32):
%41 = arith.subi %c5_i32, %arg6 : i32
%42 = arith.cmpi eq, %arg5, %c5_i32 : i32
%43 = arith.select %42, %c0_i32, %41 : i32
linalg.yield %43 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %40, %arg4, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
cf.br ^bb1(%10, %cst_2, %cst_2, %cst_1 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%11: tensor<i32>, %12: tensor<1x10xf32>, %13: tensor<1x10xf32>, %14: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%15 = flow.dispatch.workgroups[%c1, %c1, %c1](%11, %8) : (tensor<i32>, tensor<i32>) -> tensor<i1> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i1>) {
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i1>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %36 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i1>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i1):
%39 = arith.subi %c5_i32, %arg6 : i32
%40 = arith.cmpi slt, %arg5, %39 : i32
linalg.yield %40 : i1
} -> tensor<i1>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:i1>
flow.return
}
%16 = flow.tensor.load %15 : tensor<i1>
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = flow.dispatch.workgroups[%c1, %c1, %c1](%11) : (tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c1_i32 = arith.constant 1 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%38 = arith.addi %arg4, %c1_i32 : i32
linalg.yield %38 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %37, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%18 = flow.dispatch.workgroups[%c1, %c1, %c1](%4, %11) : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<f32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x1xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:f32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x1xf32> -> tensor<5x1x1xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<f32>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36 : tensor<i32>) outs(%37 : tensor<f32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = tensor.extract %35[%39, %c0_3, %c0_3] : tensor<5x1x1xf32>
linalg.yield %40 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
%19 = flow.dispatch.workgroups[%c64, %c1, %c1](%3, %11) : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<64xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x64xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:64xf32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x64xf32> -> tensor<5x1x64xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [64] : tensor<64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%36 : tensor<i32>) outs(%37 : tensor<64xf32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = linalg.index 0 : index
%41 = tensor.extract %35[%39, %c0_3, %40] : tensor<5x1x64xf32>
linalg.yield %41 : f32
} -> tensor<64xf32>
flow.dispatch.tensor.store %38, %arg4, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
flow.return
}
%20 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%21 = flow.tensor.reshape %19 : tensor<64xf32> -> tensor<1x64xf32>
%22 = flow.tensor.update %21, %20[%c0, %c0] : tensor<1x64xf32> -> %20 as tensor<1x74xf32>
%23 = flow.tensor.update %13, %22[%c0, %c64] : tensor<1x10xf32> -> %22 as tensor<1x74xf32>
%24 = flow.dispatch.workgroups[%c40, %c1, %c1](%23) : (tensor<1x74xf32>) -> tensor<1x40xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x74xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x40xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 74], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x74xf32> -> tensor<1x74xf32>
%36 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%37 = linalg.fill ins(%cst_3 : f32) outs(%36 : tensor<1x40xf32>) -> tensor<1x40xf32>
%38 = linalg.matmul ins(%35, %cst_4 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%38 : tensor<1x40xf32>) outs(%36 : tensor<1x40xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%40 = arith.addf %arg4, %cst_3 : f32
linalg.yield %40 : f32
} -> tensor<1x40xf32>
flow.dispatch.tensor.store %39, %arg3, offsets = [0, 0], sizes = [1, 40], strides = [1, 1] : tensor<1x40xf32> -> !flow.dispatch.tensor<writeonly:1x40xf32>
flow.return
}
%25 = flow.tensor.slice %24[%c0, %c20 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%26 = flow.tensor.slice %24[%c0, %c10 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%27 = flow.tensor.slice %24[%c0, %c0 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%28 = flow.dispatch.workgroups[%c10, %c1, %c1](%25, %12, %26, %27) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x10xf32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 1.000000e+01 : f32
%cst_5 = arith.constant -1.000000e+01 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg7, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %arg9, %cst_3 : f32
%47 = math.tanh %46 : f32
%48 = arith.mulf %47, %cst_3 : f32
%49 = arith.addf %48, %cst_3 : f32
%50 = arith.mulf %49, %41 : f32
%51 = arith.mulf %45, %arg8 : f32
%52 = arith.addf %51, %50 : f32
%53 = arith.minf %52, %cst_4 : f32
%54 = arith.maxf %53, %cst_5 : f32
linalg.yield %54 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
%29 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %12, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%38 : tensor<1x10xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32, %arg9: f32):
%40 = arith.cmpf ogt, %arg6, %cst_3 : f32
%41 = arith.select %40, %arg7, %arg8 : f32
linalg.yield %41 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %39, %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
%30 = flow.tensor.slice %24[%c0, %c30 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%31 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %13, %30, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg9, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %45, %41 : f32
%47 = arith.cmpf ogt, %arg7, %cst_4 : f32
%48 = arith.select %47, %arg8, %46 : f32
linalg.yield %48 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
%32 = flow.tensor.reshape %31 : tensor<1x10xf32> -> tensor<10xf32>
%33 = flow.tensor.load %11 : tensor<i32>
%34 = flow.dispatch.workgroups[%c10, %c1, %c1](%32, %14, %33) : (tensor<10xf32>, tensor<5x1x10xf32>, i32) -> %14 =
(%arg2: !flow.dispatch.tensor<readonly:10xf32>, %arg3: !flow.dispatch.tensor<readwrite:5x1x10xf32>, %arg4: i32) {
%c1_3 = arith.constant 1 : index
%c0_4 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c4_i32 = arith.constant 4 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:10xf32> -> tensor<10xf32>
%36 = arith.maxsi %arg4, %c0_i32 : i32
%37 = arith.minsi %36, %c4_i32 : i32
%38 = arith.index_cast %37 : i32 to index
flow.dispatch.tensor.store %35, %arg3, offsets = [%38, %c0_4, %c0_4], sizes = [1, 1, 10], strides = [%c1_3, %c1_3, %c1_3] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:5x1x10xf32>
flow.return
}
cf.br ^bb1(%17, %29, %31, %34 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %14 : tensor<5x1x10xf32>
}
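// Note (illustrative sketch, not part of the pass output): DispatchLinalgOnTensors
// has grouped each linalg op (together with its fused producers) into a
// flow.dispatch.workgroups region keyed by a 3-D workload, rewritten the pure data
// movement at the top level into flow.tensor.splat/reshape/slice/update, read the
// loop condition back on the host with flow.tensor.load to drive cf.cond_br, and
// passed the dynamic insertion index into its dispatch as a primitive i32 operand.
// The general shape of such a region, reduced to a pass-through copy for clarity:
func.func @dispatch_region_shape(%in: tensor<1x10xf32>) -> tensor<1x10xf32> {
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %0 = flow.dispatch.workgroups[%c10, %c1, %c1](%in) : (tensor<1x10xf32>) -> tensor<1x10xf32> =
      (%arg1: !flow.dispatch.tensor<readonly:1x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x10xf32>) {
    %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
    flow.dispatch.tensor.store %1, %arg2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
    flow.return
  }
  return %0 : tensor<1x10xf32>
}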
// -----// IR Dump After CaptureDispatchDynamicDims //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%c0 = arith.constant 0 : index
%c30 = arith.constant 30 : index
%c64 = arith.constant 64 : index
%c40 = arith.constant 40 : index
%c10 = arith.constant 10 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = flow.tensor.splat %cst : tensor<1x5x64xf32>
%1 = flow.tensor.reshape %arg1 : tensor<1x5x2x2xf32> -> tensor<5x4xf32>
%2 = flow.dispatch.workgroups[%c4, %c5, %c1](%1, %0) : (tensor<5x4xf32>, tensor<1x5x64xf32>) -> %0 =
(%arg2: !flow.dispatch.tensor<readonly:5x4xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5x64xf32>) {
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [5, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x4xf32> -> tensor<5x4xf32>
flow.dispatch.tensor.store %35, %arg3, offsets = [%c0_3, %c0_3, %c0_3], sizes = [1, 5, 4], strides = [%c1_4, %c1_4, %c1_4] : tensor<5x4xf32> -> !flow.dispatch.tensor<readwrite:1x5x64xf32>
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<1x5x64xf32> -> tensor<5x1x64xf32>
%4 = flow.tensor.reshape %arg0 : tensor<1x5xf32> -> tensor<5x1x1xf32>
%5 = flow.tensor.splat %cst_0 : tensor<1x5xf32>
%6 = flow.dispatch.workgroups[%c5, %c1, %c1](%arg0, %5) : (tensor<1x5xf32>, tensor<1x5xf32>) -> %5 =
(%arg2: !flow.dispatch.tensor<readonly:1x5xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x5xf32> -> tensor<1x5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x5xf32> -> tensor<1x5xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<1x5xf32>) outs(%36 : tensor<1x5xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%38 = arith.minf %arg4, %arg5 : f32
linalg.yield %38 : f32
} -> tensor<1x5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : tensor<1x5xf32> -> !flow.dispatch.tensor<readwrite:1x5xf32>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<1x5xf32> -> tensor<5xf32>
%8 = flow.dispatch.workgroups[%c1, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%36 : tensor<i32>) -> tensor<i32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%37 : tensor<i32>) {
^bb0(%arg4: f32, %arg5: i32, %arg6: i32):
%40 = arith.cmpf oeq, %arg4, %cst_3 : f32
%41 = arith.extui %40 : i1 to i32
%42 = arith.muli %41, %arg5 : i32
%43 = arith.maxsi %42, %arg6 : i32
linalg.yield %43 : i32
} -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%38 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%40 = arith.subi %c5_i32, %arg4 : i32
linalg.yield %40 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %39, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%9 = flow.dispatch.workgroups[%c5, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<5xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [5] : tensor<5xf32>
%37 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%35 : tensor<5xf32>) outs(%36 : tensor<5xf32>) : tensor<5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0], sizes = [5], strides = [1] : tensor<5xf32> -> !flow.dispatch.tensor<writeonly:5xf32>
flow.return
}
%10 = flow.dispatch.workgroups[%c1, %c1, %c1](%9, %8) : (tensor<5xf32>, tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i32>
%38 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%37 : tensor<i32>) -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%38 : tensor<i32>) {
^bb0(%arg5: f32, %arg6: i32, %arg7: i32):
%41 = arith.cmpf oeq, %arg5, %cst_3 : f32
%42 = arith.extui %41 : i1 to i32
%43 = arith.muli %42, %arg6 : i32
%44 = arith.maxsi %43, %arg7 : i32
linalg.yield %44 : i32
} -> tensor<i32>
%40 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36, %39 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i32>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i32):
%41 = arith.subi %c5_i32, %arg6 : i32
%42 = arith.cmpi eq, %arg5, %c5_i32 : i32
%43 = arith.select %42, %c0_i32, %41 : i32
linalg.yield %43 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %40, %arg4, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
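  // Annotation: %10 repeats the zero-position reduction of %8 on the reversed vector,
  // yielding 5 - (last 1-based zero position in %9), overridden to 0 when %8 == 5.
  // This value seeds the loop counter carried into ^bb1.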
cf.br ^bb1(%10, %cst_2, %cst_2, %cst_1 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%11: tensor<i32>, %12: tensor<1x10xf32>, %13: tensor<1x10xf32>, %14: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
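  // Annotation: loop-carried values are %11 = iteration counter, %12 and %13 =
  // apparently the LSTM cell state and hidden state, %14 = 5x1x10 output accumulator.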
%15 = flow.dispatch.workgroups[%c1, %c1, %c1](%11, %8) : (tensor<i32>, tensor<i32>) -> tensor<i1> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i1>) {
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i1>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %36 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i1>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i1):
%39 = arith.subi %c5_i32, %arg6 : i32
%40 = arith.cmpi slt, %arg5, %39 : i32
linalg.yield %40 : i1
} -> tensor<i1>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:i1>
flow.return
}
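  // Annotation: %15 is the loop condition, true while %11 < 5 - %8, i.e. while
  // timesteps remain to be processed.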
%16 = flow.tensor.load %15 : tensor<i1>
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = flow.dispatch.workgroups[%c1, %c1, %c1](%11) : (tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c1_i32 = arith.constant 1 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%38 = arith.addi %arg4, %c1_i32 : i32
linalg.yield %38 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %37, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
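  // Annotation: %17 increments the loop counter %11 by 1.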
%18 = flow.dispatch.workgroups[%c1, %c1, %c1](%4, %11) : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<f32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x1xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:f32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x1xf32> -> tensor<5x1x1xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<f32>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36 : tensor<i32>) outs(%37 : tensor<f32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = tensor.extract %35[%39, %c0_3, %c0_3] : tensor<5x1x1xf32>
linalg.yield %40 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
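  // Annotation: %18 gathers the scalar %4[%11, 0, 0] for the current timestep;
  // it is used below as a mask: where it is > 0 the previous states are carried
  // through unchanged, which looks like padding handling.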
%19 = flow.dispatch.workgroups[%c64, %c1, %c1](%3, %11) : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<64xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x64xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:64xf32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x64xf32> -> tensor<5x1x64xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [64] : tensor<64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%36 : tensor<i32>) outs(%37 : tensor<64xf32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = linalg.index 0 : index
%41 = tensor.extract %35[%39, %c0_3, %40] : tensor<5x1x64xf32>
linalg.yield %41 : f32
} -> tensor<64xf32>
flow.dispatch.tensor.store %38, %arg4, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
flow.return
}
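  // Annotation: %19 gathers the 64-element input slice %3[%11, 0, 0:64] for the
  // current timestep.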
%20 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%21 = flow.tensor.reshape %19 : tensor<64xf32> -> tensor<1x64xf32>
%22 = flow.tensor.update %21, %20[%c0, %c0] : tensor<1x64xf32> -> %20 as tensor<1x74xf32>
%23 = flow.tensor.update %13, %22[%c0, %c64] : tensor<1x10xf32> -> %22 as tensor<1x74xf32>
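  // Annotation: %20..%23 concatenate the current input row (%21, 1x64) with the
  // previous hidden state (%13, 1x10) into a single 1x74 row.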
%24 = flow.dispatch.workgroups[%c40, %c1, %c1](%23) : (tensor<1x74xf32>) -> tensor<1x40xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x74xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x40xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 74], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x74xf32> -> tensor<1x74xf32>
%36 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%37 = linalg.fill ins(%cst_3 : f32) outs(%36 : tensor<1x40xf32>) -> tensor<1x40xf32>
%38 = linalg.matmul ins(%35, %cst_4 : tensor<1x74xf32>, tensor<74x40xf32>) outs(%37 : tensor<1x40xf32>) -> tensor<1x40xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%38 : tensor<1x40xf32>) outs(%36 : tensor<1x40xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%40 = arith.addf %arg4, %cst_3 : f32
linalg.yield %40 : f32
} -> tensor<1x40xf32>
flow.dispatch.tensor.store %39, %arg3, offsets = [0, 0], sizes = [1, 40], strides = [1, 1] : tensor<1x40xf32> -> !flow.dispatch.tensor<writeonly:1x40xf32>
flow.return
}
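  // Annotation: %24 multiplies the 1x74 row by the constant 74x40 weight matrix
  // (all elements 4.2e-01) and adds a zero bias, producing what looks like the four
  // concatenated 10-wide gate pre-activations.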
%25 = flow.tensor.slice %24[%c0, %c20 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%26 = flow.tensor.slice %24[%c0, %c10 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%27 = flow.tensor.slice %24[%c0, %c0 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
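  // Annotation: gate pre-activation slices of %24: %25 = columns 20..29,
  // %26 = columns 10..19, %27 = columns 0..9 (columns 30..39 are sliced later as %30).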
%28 = flow.dispatch.workgroups[%c10, %c1, %c1](%25, %12, %26, %27) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x10xf32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 1.000000e+01 : f32
%cst_5 = arith.constant -1.000000e+01 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg7, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %arg9, %cst_3 : f32
%47 = math.tanh %46 : f32
%48 = arith.mulf %47, %cst_3 : f32
%49 = arith.addf %48, %cst_3 : f32
%50 = arith.mulf %49, %41 : f32
%51 = arith.mulf %45, %arg8 : f32
%52 = arith.addf %51, %50 : f32
%53 = arith.minf %52, %cst_4 : f32
%54 = arith.maxf %53, %cst_5 : f32
linalg.yield %54 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
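  // Annotation: %28 appears to compute the new cell state. The repeated
  //   0.5 * tanh(0.5 * x) + 0.5
  // pattern is the usual sigmoid-via-tanh identity, so the body is effectively
  //   c_new = clamp(sigmoid(%25) * %12 + sigmoid(%26) * tanh(%27), -10, 10)
  // with %12 as the previous cell state.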
%29 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %12, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%38 : tensor<1x10xf32>) {
^bb0(%arg6: f32, %arg7: f32, %arg8: f32, %arg9: f32):
%40 = arith.cmpf ogt, %arg6, %cst_3 : f32
%41 = arith.select %40, %arg7, %arg8 : f32
linalg.yield %41 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %39, %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
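  // Annotation: %29 selects per element: where the mask scalar %18 > 0 keep the
  // previous cell state %12, otherwise take the freshly computed %28.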
%30 = flow.tensor.slice %24[%c0, %c30 for %c1, %c10] : tensor<1x40xf32> -> tensor<1x10xf32>
%31 = flow.dispatch.workgroups[%c10, %c1, %c1](%18, %13, %30, %28) : (tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> =
(%arg2: !flow.dispatch.tensor<readonly:f32>, %arg3: !flow.dispatch.tensor<readonly:1x10xf32>, %arg4: !flow.dispatch.tensor<readonly:1x10xf32>, %arg5: !flow.dispatch.tensor<readonly:1x10xf32>, %arg6: !flow.dispatch.tensor<writeonly:1x10xf32>) {
%cst_3 = arith.constant 5.000000e-01 : f32
%cst_4 = arith.constant 0.000000e+00 : f32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%37 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%38 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x10xf32> -> tensor<1x10xf32>
%39 = linalg.init_tensor [1, 10] : tensor<1x10xf32>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35, %36, %37, %38 : tensor<f32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) outs(%39 : tensor<1x10xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32):
%41 = math.tanh %arg10 : f32
%42 = arith.mulf %arg9, %cst_3 : f32
%43 = math.tanh %42 : f32
%44 = arith.mulf %43, %cst_3 : f32
%45 = arith.addf %44, %cst_3 : f32
%46 = arith.mulf %45, %41 : f32
%47 = arith.cmpf ogt, %arg7, %cst_4 : f32
%48 = arith.select %47, %arg8, %46 : f32
linalg.yield %48 : f32
} -> tensor<1x10xf32>
flow.dispatch.tensor.store %40, %arg6, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor<writeonly:1x10xf32>
flow.return
}
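  // Annotation: %31 is the new hidden state, sigmoid(%30) * tanh(c_new) via the same
  // tanh identity, with the same %18 > 0 select falling back to the previous hidden
  // state %13.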
%32 = flow.tensor.reshape %31 : tensor<1x10xf32> -> tensor<10xf32>
%33 = flow.tensor.load %11 : tensor<i32>
%34 = flow.dispatch.workgroups[%c10, %c1, %c1](%32, %14, %33) : (tensor<10xf32>, tensor<5x1x10xf32>, i32) -> %14 =
(%arg2: !flow.dispatch.tensor<readonly:10xf32>, %arg3: !flow.dispatch.tensor<readwrite:5x1x10xf32>, %arg4: i32) {
%c1_3 = arith.constant 1 : index
%c0_4 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c4_i32 = arith.constant 4 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:10xf32> -> tensor<10xf32>
%36 = arith.maxsi %arg4, %c0_i32 : i32
%37 = arith.minsi %36, %c4_i32 : i32
%38 = arith.index_cast %37 : i32 to index
flow.dispatch.tensor.store %35, %arg3, offsets = [%38, %c0_4, %c0_4], sizes = [1, 1, 10], strides = [%c1_3, %c1_3, %c1_3] : tensor<10xf32> -> !flow.dispatch.tensor<readwrite:5x1x10xf32>
flow.return
}
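  // Annotation: %34 performs a dynamic update slice, writing the 10-element hidden
  // state into row clamp(%11, 0, 4) of the 5x1x10 output accumulator %14.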
cf.br ^bb1(%17, %29, %31, %34 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb3: // pred: ^bb1
return %14 : tensor<5x1x10xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func.func private @_main(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5x2x2xf32>) -> tensor<5x1x10xf32> {
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c20 = arith.constant 20 : index
%c0 = arith.constant 0 : index
%c30 = arith.constant 30 : index
%c64 = arith.constant 64 : index
%c40 = arith.constant 40 : index
%c10 = arith.constant 10 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0x7F800000 : f32
%cst_1 = arith.constant dense<0.000000e+00> : tensor<5x1x10xf32>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x10xf32>
%0 = flow.tensor.splat %cst : tensor<1x5x64xf32>
%1 = flow.tensor.reshape %arg1 : tensor<1x5x2x2xf32> -> tensor<5x4xf32>
%2 = flow.dispatch.workgroups[%c4, %c5, %c1](%1, %0) : (tensor<5x4xf32>, tensor<1x5x64xf32>) -> %0 =
(%arg2: !flow.dispatch.tensor<readonly:5x4xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5x64xf32>) {
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [5, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x4xf32> -> tensor<5x4xf32>
flow.dispatch.tensor.store %35, %arg3, offsets = [%c0_3, %c0_3, %c0_3], sizes = [1, 5, 4], strides = [%c1_4, %c1_4, %c1_4] : tensor<5x4xf32> -> !flow.dispatch.tensor<readwrite:1x5x64xf32>
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<1x5x64xf32> -> tensor<5x1x64xf32>
%4 = flow.tensor.reshape %arg0 : tensor<1x5xf32> -> tensor<5x1x1xf32>
%5 = flow.tensor.splat %cst_0 : tensor<1x5xf32>
%6 = flow.dispatch.workgroups[%c5, %c1, %c1](%arg0, %5) : (tensor<1x5xf32>, tensor<1x5xf32>) -> %5 =
(%arg2: !flow.dispatch.tensor<readonly:1x5xf32>, %arg3: !flow.dispatch.tensor<readwrite:1x5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x5xf32> -> tensor<1x5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x5xf32> -> tensor<1x5xf32>
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%35 : tensor<1x5xf32>) outs(%36 : tensor<1x5xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%38 = arith.minf %arg4, %arg5 : f32
linalg.yield %38 : f32
} -> tensor<1x5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0, 0], sizes = [1, 5], strides = [1, 1] : tensor<1x5xf32> -> !flow.dispatch.tensor<readwrite:1x5xf32>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<1x5xf32> -> tensor<5xf32>
%8 = flow.dispatch.workgroups[%c1, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%36 : tensor<i32>) -> tensor<i32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%37 : tensor<i32>) {
^bb0(%arg4: f32, %arg5: i32, %arg6: i32):
%40 = arith.cmpf oeq, %arg4, %cst_3 : f32
%41 = arith.extui %40 : i1 to i32
%42 = arith.muli %41, %arg5 : i32
%43 = arith.maxsi %42, %arg6 : i32
linalg.yield %43 : i32
} -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%38 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%40 = arith.subi %c5_i32, %arg4 : i32
linalg.yield %40 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %39, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%9 = flow.dispatch.workgroups[%c5, %c1, %c1](%7) : (tensor<5xf32>) -> tensor<5xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<writeonly:5xf32>) {
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = linalg.init_tensor [5] : tensor<5xf32>
%37 = iree_linalg_ext.reverse dimensions(dense<0> : tensor<1xi64>) ins(%35 : tensor<5xf32>) outs(%36 : tensor<5xf32>) : tensor<5xf32>
flow.dispatch.tensor.store %37, %arg3, offsets = [0], sizes = [5], strides = [1] : tensor<5xf32> -> !flow.dispatch.tensor<writeonly:5xf32>
flow.return
}
%10 = flow.dispatch.workgroups[%c1, %c1, %c1](%9, %8) : (tensor<5xf32>, tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:5xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i32>) {
%c-2147483648_i32 = arith.constant -2147483648 : i32
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%c5_i32 = arith.constant 5 : i32
%c0_i32 = arith.constant 0 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i32>
%38 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%37 : tensor<i32>) -> tensor<i32>
%39 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%35, %cst_4 : tensor<5xf32>, tensor<5xi32>) outs(%38 : tensor<i32>) {
^bb0(%arg5: f32, %arg6: i32, %arg7: i32):
%41 = arith.cmpf oeq, %arg5, %cst_3 : f32
%42 = arith.extui %41 : i1 to i32
%43 = arith.muli %42, %arg6 : i32
%44 = arith.maxsi %43, %arg7 : i32
linalg.yield %44 : i32
} -> tensor<i32>
%40 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36, %39 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i32>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i32):
%41 = arith.subi %c5_i32, %arg6 : i32
%42 = arith.cmpi eq, %arg5, %c5_i32 : i32
%43 = arith.select %42, %c0_i32, %41 : i32
linalg.yield %43 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %40, %arg4, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
cf.br ^bb1(%10, %cst_2, %cst_2, %cst_1 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>)
^bb1(%11: tensor<i32>, %12: tensor<1x10xf32>, %13: tensor<1x10xf32>, %14: tensor<5x1x10xf32>): // 2 preds: ^bb0, ^bb2
%15 = flow.dispatch.workgroups[%c1, %c1, %c1](%11, %8) : (tensor<i32>, tensor<i32>) -> tensor<i1> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:i1>) {
%c5_i32 = arith.constant 5 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<i1>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35, %36 : tensor<i32>, tensor<i32>) outs(%37 : tensor<i1>) {
^bb0(%arg5: i32, %arg6: i32, %arg7: i1):
%39 = arith.subi %c5_i32, %arg6 : i32
%40 = arith.cmpi slt, %arg5, %39 : i32
linalg.yield %40 : i1
} -> tensor<i1>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:i1>
flow.return
}
%16 = flow.tensor.load %15 : tensor<i1>
cf.cond_br %16, ^bb2, ^bb3
^bb2: // pred: ^bb1
%17 = flow.dispatch.workgroups[%c1, %c1, %c1](%11) : (tensor<i32>) -> tensor<i32> =
(%arg2: !flow.dispatch.tensor<readonly:i32>, %arg3: !flow.dispatch.tensor<writeonly:i32>) {
%c1_i32 = arith.constant 1 : i32
%35 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%36 = linalg.init_tensor [] : tensor<i32>
%37 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%35 : tensor<i32>) outs(%36 : tensor<i32>) {
^bb0(%arg4: i32, %arg5: i32):
%38 = arith.addi %arg4, %c1_i32 : i32
linalg.yield %38 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %37, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
}
%18 = flow.dispatch.workgroups[%c1, %c1, %c1](%4, %11) : (tensor<5x1x1xf32>, tensor<i32>) -> tensor<f32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x1xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:f32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x1xf32> -> tensor<5x1x1xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [] : tensor<f32>
%38 = linalg.generic {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = []} ins(%36 : tensor<i32>) outs(%37 : tensor<f32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = tensor.extract %35[%39, %c0_3, %c0_3] : tensor<5x1x1xf32>
linalg.yield %40 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %38, %arg4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
%19 = flow.dispatch.workgroups[%c64, %c1, %c1](%3, %11) : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<64xf32> =
(%arg2: !flow.dispatch.tensor<readonly:5x1x64xf32>, %arg3: !flow.dispatch.tensor<readonly:i32>, %arg4: !flow.dispatch.tensor<writeonly:64xf32>) {
%c0_3 = arith.constant 0 : index
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0], sizes = [5, 1, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x1x64xf32> -> tensor<5x1x64xf32>
%36 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%37 = linalg.init_tensor [64] : tensor<64xf32>
%38 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%36 : tensor<i32>) outs(%37 : tensor<64xf32>) {
^bb0(%arg5: i32, %arg6: f32):
%39 = arith.index_cast %arg5 : i32 to index
%40 = linalg.index 0 : index
%41 = tensor.extract %35[%39, %c0_3, %40] : tensor<5x1x64xf32>
linalg.yield %41 : f32
} -> tensor<64xf32>
flow.dispatch.tensor.store %38, %arg4, offsets = [0], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:64xf32>
flow.return
}
%20 = linalg.init_tensor [1, 74] : tensor<1x74xf32>
%21 = flow.tensor.reshape %19 : tensor<64xf32> -> tensor<1x64xf32>
%22 = flow.tensor.update %21, %20[%c0, %c0] : tensor<1x64xf32> -> %20 as tensor<1x74xf32>
%23 = flow.tensor.update %13, %22[%c0, %c64] : tensor<1x10xf32> -> %22 as tensor<1x74xf32>
%24 = flow.dispatch.workgroups[%c40, %c1, %c1](%23) : (tensor<1x74xf32>) -> tensor<1x40xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x74xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x40xf32>) {
%cst_3 = arith.constant 0.000000e+00 : f32
%cst_4 = arith.constant dense<4.200000e-01> : tensor<74x40xf32>
%35 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [1, 74], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x74xf32> -> tensor<1x74xf32>
%36 = linalg.init_tensor [1, 40] : tensor<1x40xf32>
%37 = linalg.fill ins(%cst_3 :