// Created June 24, 2024 22:15
// Saved from AmosLewis/33187568cf09d47b3d9a75e6ff252ca8 (GitHub).
// Warning from the hosting page: this file contains bidirectional Unicode text
// that may be interpreted or compiled differently than what appears below.
// To review, open the file in an editor that reveals hidden Unicode characters.
// Affine-map attribute aliases. They are referenced by name from the
// `indexing_maps` lists of the linalg.generic ops in this module, so the alias
// names and the map expressions must stay exactly as written.
//
// Reading guide: a result position holding the constant 0 always addresses
// index 0 of that operand dimension, whatever the loop index is; a map with
// fewer results than loop dims addresses a lower-rank operand.

// --- 4-D iteration space, basic forms ---
#map = affine_map<(d0, d1, d2, d3) -> (d1)>                  // 1-D operand indexed by d1 only
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>     // 4-D identity

// --- 3-D iteration space ---
#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>             // 3-D identity
#map3 = affine_map<(d0, d1, d2) -> (d0, d2, d1)>             // last two dims swapped
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, 0)>              // last position pinned to 0
#map5 = affine_map<(d0, d1, d2) -> (0, d1, 0)>               // first and last positions pinned to 0
#map6 = affine_map<(d0, d1, d2) -> (0, d1, d2)>              // first position pinned to 0
#map7 = affine_map<(d0, d1, d2) -> (d2)>                     // 1-D operand indexed by d2 only
#map8 = affine_map<(d0, d1, d2) -> (d1, d2)>                 // 2-D operand, d0 dropped

// --- 4-D iteration space, permutations and pinned/dropped dims ---
#map9 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>     // d1 and d2 swapped
#map10 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>    // d2 and d3 swapped
#map11 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, d3)>      // first two positions pinned to 0
#map12 = affine_map<(d0, d1, d2, d3) -> ()>                  // rank-0 operand
#map13 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>        // 3-D operand, d3 dropped
#map14 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, 0)>       // only d2 varies
#map15 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, 0)>     // last position pinned to 0

// --- 3-D iteration space, rank-0 operand ---
#map16 = affine_map<(d0, d1, d2) -> ()>                      // rank-0 operand

// --- 4-D iteration space, continued ---
#map17 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2, d1)>    // d1 and d3 swapped
#map18 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>     // first position pinned to 0
#map19 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, 0)>      // first and last positions pinned to 0

// --- 3-D iteration space, middle dim dropped ---
#map20 = affine_map<(d0, d1, d2) -> (d0, d2)>                // 2-D operand, d1 dropped

// --- 2-D iteration space ---
#map21 = affine_map<(d0, d1) -> (0, d1)>                     // first position pinned to 0
#map22 = affine_map<(d0, d1) -> (d0, d1)>                    // 2-D identity
#map23 = affine_map<(d0, d1) -> (d1, d0)>                    // transposed
#map24 = affine_map<(d0, d1) -> (d1)>                        // 1-D operand indexed by d1 only
module { | |
// Private, mutable module-level global @global_seed: a rank-0 i64 tensor
// whose initial value is 0.
ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
func.func @main_graph(%arg0: tensor<1x3x512x512xf32>) -> tensor<1x1000xf32> { | |
%cst = arith.constant dense_resource<__elided__> : tensor<1024x256xf32> | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0_i64 = arith.constant 0 : i64 | |
%cst_1 = arith.constant 0xFF800000 : f32 | |
%cst_2 = arith.constant 3.200000e+01 : f32 | |
%cst_3 = arith.constant 9.99999974E-6 : f32 | |
%cst_4 = arith.constant 6.400000e+01 : f32 | |
%cst_5 = arith.constant 1.600000e+02 : f32 | |
%cst_6 = arith.constant 2.560000e+02 : f32 | |
%cst_7 = arith.constant dense_resource<__elided__> : tensor<256x1024xf32> | |
%cst_8 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_9 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_10 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_11 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_12 = arith.constant dense_resource<__elided__> : tensor<1024x256xf32> | |
%cst_13 = arith.constant dense_resource<__elided__> : tensor<256x1024xf32> | |
%cst_14 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_15 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_16 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_17 = arith.constant dense_resource<__elided__> : tensor<256x256xf32> | |
%cst_18 = arith.constant dense_resource<__elided__> : tensor<640x160xf32> | |
%cst_19 = arith.constant dense_resource<__elided__> : tensor<160x640xf32> | |
%cst_20 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_21 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_22 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_23 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_24 = arith.constant dense_resource<__elided__> : tensor<640x160xf32> | |
%cst_25 = arith.constant dense_resource<__elided__> : tensor<160x640xf32> | |
%cst_26 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_27 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_28 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_29 = arith.constant dense_resource<__elided__> : tensor<160x160xf32> | |
%cst_30 = arith.constant dense_resource<__elided__> : tensor<256x64xf32> | |
%cst_31 = arith.constant dense_resource<__elided__> : tensor<64x256xf32> | |
%cst_32 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_33 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_34 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_35 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_36 = arith.constant dense_resource<__elided__> : tensor<256x64xf32> | |
%cst_37 = arith.constant dense_resource<__elided__> : tensor<64x256xf32> | |
%cst_38 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_39 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_40 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_41 = arith.constant dense_resource<__elided__> : tensor<64x64xf32> | |
%cst_42 = arith.constant dense_resource<__elided__> : tensor<128x32xf32> | |
%cst_43 = arith.constant dense_resource<__elided__> : tensor<32x128xf32> | |
%cst_44 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_45 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_46 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_47 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_48 = arith.constant dense_resource<__elided__> : tensor<128x32xf32> | |
%cst_49 = arith.constant dense_resource<__elided__> : tensor<32x128xf32> | |
%cst_50 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_51 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_52 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_53 = arith.constant dense_resource<__elided__> : tensor<32x32xf32> | |
%cst_54 = arith.constant dense_resource<__elided__> : tensor<1000xf32> | |
%cst_55 = arith.constant dense_resource<__elided__> : tensor<1000x256xf32> | |
%cst_56 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_57 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_58 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_59 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_60 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_61 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_62 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_63 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_64 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_65 = arith.constant dense_resource<__elided__> : tensor<1024xf32> | |
%cst_66 = arith.constant dense_resource<__elided__> : tensor<1024x1x3x3xf32> | |
%cst_67 = arith.constant dense_resource<__elided__> : tensor<1024xf32> | |
%cst_68 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_69 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_70 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_71 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_72 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_73 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_74 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_75 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_76 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_77 = arith.constant dense_resource<__elided__> : tensor<1024xf32> | |
%cst_78 = arith.constant dense_resource<__elided__> : tensor<1024x1x3x3xf32> | |
%cst_79 = arith.constant dense_resource<__elided__> : tensor<1024xf32> | |
%cst_80 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_81 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_82 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_83 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_84 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_85 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_86 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_87 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_88 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_89 = arith.constant dense_resource<__elided__> : tensor<640xf32> | |
%cst_90 = arith.constant dense_resource<__elided__> : tensor<640x1x3x3xf32> | |
%cst_91 = arith.constant dense_resource<__elided__> : tensor<640xf32> | |
%cst_92 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_93 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_94 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_95 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_96 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_97 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_98 = arith.constant dense_resource<__elided__> : tensor<160x160x2x2xf32> | |
%cst_99 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_100 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_101 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_102 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_103 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_104 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_105 = arith.constant dense_resource<__elided__> : tensor<640xf32> | |
%cst_106 = arith.constant dense_resource<__elided__> : tensor<640x1x3x3xf32> | |
%cst_107 = arith.constant dense_resource<__elided__> : tensor<640xf32> | |
%cst_108 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_109 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_110 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_111 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_112 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_113 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_114 = arith.constant dense_resource<__elided__> : tensor<160x160x2x2xf32> | |
%cst_115 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_116 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_117 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_118 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_119 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_120 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_121 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_122 = arith.constant dense_resource<__elided__> : tensor<256x1x3x3xf32> | |
%cst_123 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_124 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_125 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_126 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_127 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_128 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_129 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_130 = arith.constant dense_resource<__elided__> : tensor<64x64x4x4xf32> | |
%cst_131 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_132 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_133 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_134 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_135 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_136 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_137 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_138 = arith.constant dense_resource<__elided__> : tensor<256x1x3x3xf32> | |
%cst_139 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_140 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_141 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_142 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_143 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_144 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_145 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_146 = arith.constant dense_resource<__elided__> : tensor<64x64x4x4xf32> | |
%cst_147 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_148 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_149 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_150 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_151 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_152 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_153 = arith.constant dense_resource<__elided__> : tensor<128xf32> | |
%cst_154 = arith.constant dense_resource<__elided__> : tensor<128x1x3x3xf32> | |
%cst_155 = arith.constant dense_resource<__elided__> : tensor<128xf32> | |
%cst_156 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_157 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_158 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_159 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_160 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_161 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_162 = arith.constant dense_resource<__elided__> : tensor<32x32x8x8xf32> | |
%cst_163 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_164 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_165 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_166 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_167 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_168 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_169 = arith.constant dense_resource<__elided__> : tensor<128xf32> | |
%cst_170 = arith.constant dense_resource<__elided__> : tensor<128x1x3x3xf32> | |
%cst_171 = arith.constant dense_resource<__elided__> : tensor<128xf32> | |
%cst_172 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_173 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_174 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_175 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_176 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_177 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_178 = arith.constant dense_resource<__elided__> : tensor<32x32x8x8xf32> | |
%cst_179 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_180 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_181 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_182 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_183 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_184 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_185 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_186 = arith.constant dense_resource<__elided__> : tensor<256xf32> | |
%cst_187 = arith.constant dense_resource<__elided__> : tensor<256x160x3x3xf32> | |
%cst_188 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_189 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_190 = arith.constant dense_resource<__elided__> : tensor<160xf32> | |
%cst_191 = arith.constant dense_resource<__elided__> : tensor<160x64x3x3xf32> | |
%cst_192 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_193 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_194 = arith.constant dense_resource<__elided__> : tensor<64xf32> | |
%cst_195 = arith.constant dense_resource<__elided__> : tensor<64x32x3x3xf32> | |
%cst_196 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_197 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_198 = arith.constant dense_resource<__elided__> : tensor<32xf32> | |
%cst_199 = arith.constant dense_resource<__elided__> : tensor<32x3x7x7xf32> | |
%cst_200 = arith.constant dense<5.65685415> : tensor<f32> | |
%cst_201 = arith.constant dense<1.41421354> : tensor<f32> | |
%cst_202 = arith.constant dense<1.000000e+00> : tensor<f32> | |
%cst_203 = arith.constant dense<5.000000e-01> : tensor<f32> | |
%padded = tensor.pad %arg0 low[0, 0, 3, 3] high[0, 0, 3, 3] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x3x512x512xf32> to tensor<1x3x518x518xf32> | |
%0 = tensor.empty() : tensor<1x32x128x128xf32> | |
%1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_198 : tensor<32xf32>) outs(%0 : tensor<1x32x128x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x128x128xf32> | |
%2 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<4> : vector<2xi64>} ins(%padded, %cst_199 : tensor<1x3x518x518xf32>, tensor<32x3x7x7xf32>) outs(%1 : tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> | |
%collapsed = tensor.collapse_shape %2 [[0], [1], [2, 3]] : tensor<1x32x128x128xf32> into tensor<1x32x16384xf32> | |
%3 = tensor.empty() : tensor<1x16384x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed : tensor<1x32x16384xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%5 = tensor.empty() : tensor<1x16384x1xf32> | |
%6 = linalg.fill ins(%cst_0 : f32) outs(%5 : tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %9 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%14 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%16 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%17 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%10, %16 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%18 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17, %cst_197 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%19 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%18, %cst_196 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%20 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%19 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%20 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%22 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%21 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%23 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%19, %22 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%24 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%23, %23 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%25 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%24 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%26 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%27 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%28 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%27 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%29 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%28 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%30 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%23, %29 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%31 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30, %cst_183 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%32 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%31, %cst_182 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%33 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%32 : tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%34 = tensor.empty() : tensor<1x32x32xf32> | |
%35 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_53 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
%36 = linalg.fill ins(%cst_0 : f32) outs(%3 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
%37 = linalg.batch_matmul ins(%33, %35 : tensor<1x16384x32xf32>, tensor<1x32x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
%38 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_181, %37 : tensor<32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%expanded = tensor.expand_shape %38 [[0], [1], [2, 3]] output_shape [1, 16384, 1, 32] : tensor<1x16384x32xf32> into tensor<1x16384x1x32xf32> | |
%39 = tensor.empty() : tensor<1x1x16384x32xf32> | |
%40 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<1x16384x1x32xf32>) outs(%39 : tensor<1x1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x16384x32xf32> | |
%41 = tensor.empty() : tensor<1x32x16384xf32> | |
%42 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%32 : tensor<1x16384x32xf32>) outs(%41 : tensor<1x32x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x16384xf32> | |
%expanded_204 = tensor.expand_shape %42 [[0], [1], [2, 3]] output_shape [1, 32, 128, 128] : tensor<1x32x16384xf32> into tensor<1x32x128x128xf32> | |
%43 = tensor.empty() : tensor<1x32x16x16xf32> | |
%44 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_177 : tensor<32xf32>) outs(%43 : tensor<1x32x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x16x16xf32> | |
%45 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<8> : vector<2xi64>} ins(%expanded_204, %cst_178 : tensor<1x32x128x128xf32>, tensor<32x32x8x8xf32>) outs(%44 : tensor<1x32x16x16xf32>) -> tensor<1x32x16x16xf32> | |
%collapsed_205 = tensor.collapse_shape %45 [[0], [1], [2, 3]] : tensor<1x32x16x16xf32> into tensor<1x32x256xf32> | |
%46 = tensor.empty() : tensor<1x256x32xf32> | |
%47 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_205 : tensor<1x32x256xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
// Normalisation of %47 (1x256x32) along its last dimension, followed by a
// per-channel multiply and add. %46 and %47 are defined above this excerpt.
// NOTE(review): values of %cst_2 and %cst_3 are not visible here; the pattern
// matches layer norm with %cst_2 = 32.0 (reduction length) and %cst_3 = epsilon
// -- confirm against their definitions.
%48 = tensor.empty() : tensor<1x256x1xf32> | |
%49 = linalg.fill ins(%cst_0 : f32) outs(%48 : tensor<1x256x1xf32>) -> tensor<1x256x1xf32> | |
// Sum over d2 (the 32-wide dim) into the zero-filled accumulator %49.
%50 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%47 : tensor<1x256x32xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Sum divided by %cst_2 (presumably yielding the mean).
%51 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%50 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Broadcast the 1x256x1 result back to 1x256x32.
%52 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%51 : tensor<1x256x1xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
// Centered input: %47 - %52.
%53 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%47, %52 : tensor<1x256x32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
// Elementwise square of the centered input.
%54 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%53, %53 : tensor<1x256x32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
// Sum of squares over d2, then divide by %cst_2 (presumably the variance).
%55 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%54 : tensor<1x256x32xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%56 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%55 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Add %cst_3 and take the reciprocal square root.
%57 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%56 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%58 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%57 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Broadcast the rsqrt factor to 1x256x32 and scale the centered input by it.
%59 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%58 : tensor<1x256x1xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
%60 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%53, %59 : tensor<1x256x32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
// Per-channel multiply by %cst_176 and add of %cst_175 (both tensor<32xf32>,
// indexed by d2; presumably the learned scale and shift).
%61 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%60, %cst_176 : tensor<1x256x32xf32>, tensor<32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%62 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%61, %cst_175 : tensor<1x256x32xf32>, tensor<32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
// Elementwise copy of %62 (the body only yields the input).
%63 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%62 : tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
// Two independent 32x32 linear projections of %63 (1x256x32), each followed by
// a bias add, then reshaped/permuted to 4-D layouts.
// NOTE(review): judging by how %75 and %72 are consumed further down, the
// %cst_52/%cst_180 path looks like the attention "key" and the %cst_51/%cst_179
// path the "value" -- confirm against the source model.
// %34 and %46 are defined above this excerpt.
// Broadcast the 32x32 weight %cst_52 to a batched 1x32x32 operand.
%64 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_52 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
// Zero-filled accumulator shared by both batch_matmul ops below.
%65 = linalg.fill ins(%cst_0 : f32) outs(%46 : tensor<1x256x32xf32>) -> tensor<1x256x32xf32> | |
%66 = linalg.batch_matmul ins(%63, %64 : tensor<1x256x32xf32>, tensor<1x32x32xf32>) outs(%65 : tensor<1x256x32xf32>) -> tensor<1x256x32xf32> | |
// Add the 32-element vector %cst_180 along the last dim.
%67 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_180, %66 : tensor<32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
// Split the last dim 32 -> 1x32, giving 1x256x1x32.
%expanded_206 = tensor.expand_shape %67 [[0], [1], [2, 3]] output_shape [1, 256, 1, 32] : tensor<1x256x32xf32> into tensor<1x256x1x32xf32> | |
// Second projection: same input %63, weight %cst_51, bias %cst_179.
%68 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_51 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
%69 = linalg.batch_matmul ins(%63, %68 : tensor<1x256x32xf32>, tensor<1x32x32xf32>) outs(%65 : tensor<1x256x32xf32>) -> tensor<1x256x32xf32> | |
%70 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_179, %69 : tensor<32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%expanded_207 = tensor.expand_shape %70 [[0], [1], [2, 3]] output_shape [1, 256, 1, 32] : tensor<1x256x32xf32> into tensor<1x256x1x32xf32> | |
// Swap dims 1 and 2 (#map1 -> #map9): 1x256x1x32 -> 1x1x256x32.
%71 = tensor.empty() : tensor<1x1x256x32xf32> | |
%72 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_207 : tensor<1x256x1x32xf32>) outs(%71 : tensor<1x1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x256x32xf32> | |
%73 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_206 : tensor<1x256x1x32xf32>) outs(%71 : tensor<1x1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x256x32xf32> | |
// Transpose the last two dims of %73 (#map1 -> #map10): 1x1x256x32 -> 1x1x32x256.
%74 = tensor.empty() : tensor<1x1x32x256xf32> | |
%75 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%73 : tensor<1x1x256x32xf32>) outs(%74 : tensor<1x1x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x32x256xf32> | |
// Batched product of %40 (1x1x16384x32) with %75 (1x1x32x256), then division by
// the scalar tensor %cst_200. %39 and %40 are defined above this excerpt.
// NOTE(review): %40 is presumably the attention query and %cst_200 the
// sqrt(head_dim) scale -- neither definition is visible here, confirm.
// Elementwise copies of %40 and %75 (leading unit dims indexed with constant 0).
%76 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%40 : tensor<1x1x16384x32xf32>) outs(%39 : tensor<1x1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x16384x32xf32> | |
%77 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%75 : tensor<1x1x32x256xf32>) outs(%74 : tensor<1x1x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x32x256xf32> | |
// Merge the two leading unit dims so the operands are rank-3 for batch_matmul.
%collapsed_208 = tensor.collapse_shape %76 [[0, 1], [2], [3]] : tensor<1x1x16384x32xf32> into tensor<1x16384x32xf32> | |
%collapsed_209 = tensor.collapse_shape %77 [[0, 1], [2], [3]] : tensor<1x1x32x256xf32> into tensor<1x32x256xf32> | |
// Zero-filled accumulator for the 16384x256 product.
%78 = tensor.empty() : tensor<1x16384x256xf32> | |
%79 = linalg.fill ins(%cst_0 : f32) outs(%78 : tensor<1x16384x256xf32>) -> tensor<1x16384x256xf32> | |
%80 = linalg.batch_matmul ins(%collapsed_208, %collapsed_209 : tensor<1x16384x32xf32>, tensor<1x32x256xf32>) outs(%79 : tensor<1x16384x256xf32>) -> tensor<1x16384x256xf32> | |
// Restore the 4-D layout 1x1x16384x256.
%expanded_210 = tensor.expand_shape %80 [[0, 1], [2], [3]] output_shape [1, 1, 16384, 256] : tensor<1x16384x256xf32> into tensor<1x1x16384x256xf32> | |
%81 = tensor.empty() : tensor<1x1x16384x256xf32> | |
// Divide every element by the rank-0 tensor %cst_200 (#map12 broadcasts it).
%82 = linalg.generic {indexing_maps = [#map11, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_210, %cst_200 : tensor<1x1x16384x256xf32>, tensor<f32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Softmax of %82 (1x1x16384x256) along the last dimension, computed as
// exp(x - max) / sum(exp(x - max)). %81 is the 1x1x16384x256 tensor.empty used
// as the outs shape carrier.
// Initial values for the max/arg-max reduction: index 0 and %cst_1.
// NOTE(review): %cst_1 is not visible here; presumably -inf or the lowest
// finite f32 -- confirm.
%83 = tensor.empty() : tensor<1x1x16384xi64> | |
%84 = linalg.fill ins(%c0_i64 : i64) outs(%83 : tensor<1x1x16384xi64>) -> tensor<1x1x16384xi64> | |
%85 = tensor.empty() : tensor<1x1x16384xf32> | |
%86 = linalg.fill ins(%cst_1 : f32) outs(%85 : tensor<1x1x16384xf32>) -> tensor<1x1x16384xf32> | |
// Reduce over d3: result #0 is the running maximum, result #1 the index at
// which a strictly greater value was last seen. Only #0 is used within this
// excerpt.
%87:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%82 : tensor<1x1x16384x256xf32>) outs(%86, %84 : tensor<1x1x16384xf32>, tensor<1x1x16384xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x1x16384xf32>, tensor<1x1x16384xi64>) | |
// Re-add a trailing unit dim so the max broadcasts along d3.
%expanded_211 = tensor.expand_shape %87#0 [[0], [1], [2, 3]] output_shape [1, 1, 16384, 1] : tensor<1x1x16384xf32> into tensor<1x1x16384x1xf32> | |
// x - max(x) per row.
%88 = linalg.generic {indexing_maps = [#map11, #map14, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%82, %expanded_211 : tensor<1x1x16384x256xf32>, tensor<1x1x16384x1xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Elementwise exp.
%89 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%88 : tensor<1x1x16384x256xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Row sums of the exponentials, accumulated into a zero-filled 1x1x16384x1 tensor.
%90 = tensor.empty() : tensor<1x1x16384x1xf32> | |
%91 = linalg.fill ins(%cst_0 : f32) outs(%90 : tensor<1x1x16384x1xf32>) -> tensor<1x1x16384x1xf32> | |
%92 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%89 : tensor<1x1x16384x256xf32>) outs(%91 : tensor<1x1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x1xf32> | |
// Normalise each row by its sum.
%93 = linalg.generic {indexing_maps = [#map11, #map14, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%89, %92 : tensor<1x1x16384x256xf32>, tensor<1x1x16384x1xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Elementwise copy of %93 (the body only yields the input).
%94 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%93 : tensor<1x1x16384x256xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Multiply the 16384x256 weights %94 by %72 (1x1x256x32) and bring the result
// back to a 1x16384x32 layout.
// Elementwise copy of %72.
%95 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%72 : tensor<1x1x256x32xf32>) outs(%71 : tensor<1x1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x256x32xf32> | |
// Merge the leading unit dims so both operands are rank-3.
%collapsed_212 = tensor.collapse_shape %94 [[0, 1], [2], [3]] : tensor<1x1x16384x256xf32> into tensor<1x16384x256xf32> | |
%collapsed_213 = tensor.collapse_shape %95 [[0, 1], [2], [3]] : tensor<1x1x256x32xf32> into tensor<1x256x32xf32> | |
// NOTE(review): the accumulator %36 is defined above this excerpt; the result
// is only a pure matmul if %36 is zero-filled -- confirm.
%96 = linalg.batch_matmul ins(%collapsed_212, %collapsed_213 : tensor<1x16384x256xf32>, tensor<1x256x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
%expanded_214 = tensor.expand_shape %96 [[0, 1], [2], [3]] output_shape [1, 1, 16384, 32] : tensor<1x16384x32xf32> into tensor<1x1x16384x32xf32> | |
// Swap dims 1 and 2 (#map1 -> #map9): 1x1x16384x32 -> 1x16384x1x32.
%97 = tensor.empty() : tensor<1x16384x1x32xf32> | |
%98 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_214 : tensor<1x1x16384x32xf32>) outs(%97 : tensor<1x16384x1x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x1x32xf32> | |
// Fold the trailing 1x32 back into a single 32-wide dim.
%collapsed_215 = tensor.collapse_shape %98 [[0], [1], [2, 3]] : tensor<1x16384x1x32xf32> into tensor<1x16384x32xf32> | |
// Elementwise copy; %3 (defined above this excerpt) only supplies the outs operand.
%99 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_215 : tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
// 32x32 linear projection of %99 with bias, then elementwise add of %19.
// NOTE(review): %19 is defined above this excerpt; this looks like a residual
// connection around the attention block -- confirm. %3, %34 and %36 are also
// defined above; %36 is presumably a zero-filled accumulator.
// Broadcast the 32x32 weight %cst_50 to 1x32x32.
%100 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_50 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
%101 = linalg.batch_matmul ins(%99, %100 : tensor<1x16384x32xf32>, tensor<1x32x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
// Add the 32-element vector %cst_174 along the last dim.
%102 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_174, %101 : tensor<32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Elementwise sum %102 + %19.
%103 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%102, %19 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Normalisation of %103 (1x16384x32) along its last dimension, followed by a
// per-channel multiply (%cst_173) and add (%cst_172).
// %3, %5 and %6 are defined above this excerpt.
// NOTE(review): %6 is used as the reduction accumulator and is presumably
// zero-filled; %cst_2 is presumably 32.0 and %cst_3 an epsilon -- confirm.
// Sum over d2.
%104 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%103 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Sum divided by %cst_2 (presumably the mean).
%105 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%104 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast to 1x16384x32 and subtract from the input.
%106 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%105 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%107 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%103, %106 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Square the centered values, sum over d2, divide by %cst_2.
%108 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%107, %107 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%109 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%108 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%110 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%109 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Add %cst_3, then reciprocal square root.
%111 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%110 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%112 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%111 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast the rsqrt factor and scale the centered values by it.
%113 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%112 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%114 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%107, %113 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Per-channel multiply by %cst_173 and add of %cst_172 (both tensor<32xf32>).
%115 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%114, %cst_173 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%116 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%115, %cst_172 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Elementwise copy of %116.
%117 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%116 : tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
// Linear projection of %117 from 32 to 128 features: batch matmul with the
// 32x128 weight %cst_49, then add of the 128-element vector %cst_171.
// Broadcast the weight to a batched 1x32x128 operand.
%118 = tensor.empty() : tensor<1x32x128xf32> | |
%119 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_49 : tensor<32x128xf32>) outs(%118 : tensor<1x32x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x128xf32> | |
// Zero-filled accumulator for the 16384x128 product.
%120 = tensor.empty() : tensor<1x16384x128xf32> | |
%121 = linalg.fill ins(%cst_0 : f32) outs(%120 : tensor<1x16384x128xf32>) -> tensor<1x16384x128xf32> | |
%122 = linalg.batch_matmul ins(%117, %119 : tensor<1x16384x32xf32>, tensor<1x32x128xf32>) outs(%121 : tensor<1x16384x128xf32>) -> tensor<1x16384x128xf32> | |
// Add %cst_171 along the last dim.
%123 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_171, %122 : tensor<128xf32>, tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// 3x3 depthwise convolution over %123 viewed as a 128-channel 128x128 image,
// with the result returned to the 1x16384x128 token layout.
// Transpose the last two dims (#map2 -> #map3): 1x16384x128 -> 1x128x16384.
%124 = tensor.empty() : tensor<1x128x16384xf32> | |
%125 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%123 : tensor<1x16384x128xf32>) outs(%124 : tensor<1x128x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x128x16384xf32> | |
// Split 16384 into 128x128 spatial dims.
%expanded_216 = tensor.expand_shape %125 [[0], [1], [2, 3]] output_shape [1, 128, 128, 128] : tensor<1x128x16384xf32> into tensor<1x128x128x128xf32> | |
// Pad one element of %cst_0 (0.0) on each side of both spatial dims -> 130x130.
%padded_217 = tensor.pad %expanded_216 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x128x128x128xf32> to tensor<1x128x130x130xf32> | |
// Initialise the conv output with %cst_169 broadcast along the channel dim
// (d1), so the convolution accumulates on top of it.
%126 = tensor.empty() : tensor<1x128x128x128xf32> | |
%127 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_169 : tensor<128xf32>) outs(%126 : tensor<1x128x128x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x128x128x128xf32> | |
// Drop the unit dim of the 128x1x3x3 filter to get the 128x3x3 operand.
%collapsed_218 = tensor.collapse_shape %cst_170 [[0, 1], [2], [3]] : tensor<128x1x3x3xf32> into tensor<128x3x3xf32> | |
%128 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_217, %collapsed_218 : tensor<1x128x130x130xf32>, tensor<128x3x3xf32>) outs(%127 : tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32> | |
// Flatten the spatial dims and transpose back to 1x16384x128.
%collapsed_219 = tensor.collapse_shape %128 [[0], [1], [2, 3]] : tensor<1x128x128x128xf32> into tensor<1x128x16384xf32> | |
%129 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_219 : tensor<1x128x16384xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x128xf32> | |
// Elementwise activation on %129: ((erf(x / %cst_201) + %cst_202) * x) * %cst_203.
// NOTE(review): the scalar constants are not visible here; the shape of the
// computation matches erf-based GELU with sqrt(2), 1.0 and 0.5 -- confirm.
// %120 is the 1x16384x128 tensor.empty used as the outs shape carrier.
// x / %cst_201
%130 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%129, %cst_201 : tensor<1x16384x128xf32>, tensor<f32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// erf(...)
%131 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%130 : tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// ... + %cst_202
%132 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%131, %cst_202 : tensor<1x16384x128xf32>, tensor<f32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// x * (...)
%133 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%129, %132 : tensor<1x16384x128xf32>, tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// ... * %cst_203
%134 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%133, %cst_203 : tensor<1x16384x128xf32>, tensor<f32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// Elementwise copy of %134.
%135 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%134 : tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x128xf32> | |
// Linear projection of %135 from 128 back to 32 features with bias %cst_168,
// then elementwise add of %103 (the input to the preceding normalisation).
// NOTE(review): %36 (accumulator) and %3 are defined above this excerpt; %36 is
// presumably zero-filled -- confirm.
// Broadcast the 128x32 weight %cst_48 to 1x128x32.
%136 = tensor.empty() : tensor<1x128x32xf32> | |
%137 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_48 : tensor<128x32xf32>) outs(%136 : tensor<1x128x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x128x32xf32> | |
%138 = linalg.batch_matmul ins(%135, %137 : tensor<1x16384x128xf32>, tensor<1x128x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
// Add %cst_168 along the last dim.
%139 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_168, %138 : tensor<32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Elementwise sum %139 + %103.
%140 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%139, %103 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Normalisation of %140 (1x16384x32) along its last dimension, followed by a
// per-channel multiply (%cst_167) and add (%cst_166).
// %3, %5 and %6 are defined above this excerpt.
// NOTE(review): %6 is used as the reduction accumulator and is presumably
// zero-filled; %cst_2 is presumably 32.0 and %cst_3 an epsilon -- confirm.
// Sum over d2, then divide by %cst_2.
%141 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%140 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%142 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%141 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast to 1x16384x32 and subtract from the input.
%143 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%142 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%144 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%140, %143 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Square the centered values, sum over d2, divide by %cst_2.
%145 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%144, %144 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%146 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%145 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%147 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%146 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Add %cst_3, then reciprocal square root.
%148 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%147 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%149 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%148 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast the rsqrt factor and scale the centered values by it.
%150 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%149 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%151 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%144, %150 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Per-channel multiply by %cst_167 and add of %cst_166 (both tensor<32xf32>).
%152 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%151, %cst_167 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%153 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%152, %cst_166 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Elementwise copy of %153.
%154 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%153 : tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
// 32x32 linear projection of %154 with bias %cst_165, reshaped and permuted to
// 1x1x16384x32.
// NOTE(review): presumably the attention "query" of the next block; %3, %34,
// %36 and %39 are defined above this excerpt and %36 is presumably a
// zero-filled accumulator -- confirm.
// Broadcast the 32x32 weight %cst_47 to 1x32x32.
%155 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_47 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
%156 = linalg.batch_matmul ins(%154, %155 : tensor<1x16384x32xf32>, tensor<1x32x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
// Add %cst_165 along the last dim.
%157 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_165, %156 : tensor<32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Split the last dim 32 -> 1x32, then swap dims 1 and 2 (#map1 -> #map9).
%expanded_220 = tensor.expand_shape %157 [[0], [1], [2, 3]] output_shape [1, 16384, 1, 32] : tensor<1x16384x32xf32> into tensor<1x16384x1x32xf32> | |
%158 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_220 : tensor<1x16384x1x32xf32>) outs(%39 : tensor<1x1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x16384x32xf32> | |
// Spatial down-sampling of %153: view it as a 32-channel 128x128 image, apply
// an 8x8 convolution with stride 8 (giving 16x16), and flatten back to
// 1x256x32. %41, %43 and %46 are defined above this excerpt and serve as the
// outs operands.
// Transpose the last two dims (#map2 -> #map3): 1x16384x32 -> 1x32x16384.
%159 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%153 : tensor<1x16384x32xf32>) outs(%41 : tensor<1x32x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x16384xf32> | |
// Split 16384 into 128x128 spatial dims.
%expanded_221 = tensor.expand_shape %159 [[0], [1], [2, 3]] output_shape [1, 32, 128, 128] : tensor<1x32x16384xf32> into tensor<1x32x128x128xf32> | |
// Initialise the conv output with %cst_161 broadcast along the channel dim
// (d1), so the convolution accumulates on top of it.
%160 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_161 : tensor<32xf32>) outs(%43 : tensor<1x32x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x16x16xf32> | |
%161 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<8> : vector<2xi64>} ins(%expanded_221, %cst_162 : tensor<1x32x128x128xf32>, tensor<32x32x8x8xf32>) outs(%160 : tensor<1x32x16x16xf32>) -> tensor<1x32x16x16xf32> | |
// Flatten 16x16 -> 256 and transpose to 1x256x32.
%collapsed_222 = tensor.collapse_shape %161 [[0], [1], [2, 3]] : tensor<1x32x16x16xf32> into tensor<1x32x256xf32> | |
%162 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_222 : tensor<1x32x256xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
// Statistics for a normalisation of %162 (1x256x32) along its last dimension:
// produces the centered values %166 and the broadcast rsqrt factor %172.
// %46, %48 and %49 are defined above this excerpt; %49 is used as the
// reduction accumulator.
// NOTE(review): %cst_2 is presumably 32.0 and %cst_3 an epsilon; %49 must be
// zero-filled for the sums to be correct -- confirm against their definitions.
// Sum over d2, then divide by %cst_2.
%163 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%162 : tensor<1x256x32xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%164 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%163 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Broadcast to 1x256x32 and subtract from the input.
%165 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%164 : tensor<1x256x1xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
%166 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%162, %165 : tensor<1x256x32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
// Square the centered values, sum over d2, divide by %cst_2.
%167 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%166, %166 : tensor<1x256x32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%168 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%167 : tensor<1x256x32xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%169 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%168 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Add %cst_3, then reciprocal square root.
%170 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%169 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%171 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%170 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Broadcast the rsqrt factor to 1x256x32 for the multiply that follows.
%172 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%171 : tensor<1x256x1xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
%173 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%166, %172 : tensor<1x256x32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%174 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%173, %cst_160 : tensor<1x256x32xf32>, tensor<32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%175 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%174, %cst_159 : tensor<1x256x32xf32>, tensor<32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%176 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%175 : tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x32xf32> | |
// Stage: two independent linear projections of the same input %176 (1x256x32).
// Each one broadcasts a 32x32 weight to 1x32x32, runs a batch matmul, adds a
// 32-element vector indexed by d2, and expands the result to 1x256x1x32.
// %65 (matmul output operand) and %34 / %46 (inits) are defined outside this excerpt;
// %65 is presumably zero-filled. Confirm.
// First projection: weight %cst_46, added vector %cst_164.
%177 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_46 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
%178 = linalg.batch_matmul ins(%176, %177 : tensor<1x256x32xf32>, tensor<1x32x32xf32>) outs(%65 : tensor<1x256x32xf32>) -> tensor<1x256x32xf32> | |
%179 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_164, %178 : tensor<32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%expanded_223 = tensor.expand_shape %179 [[0], [1], [2, 3]] output_shape [1, 256, 1, 32] : tensor<1x256x32xf32> into tensor<1x256x1x32xf32> | |
// Second projection: weight %cst_45, added vector %cst_163.
%180 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_45 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
%181 = linalg.batch_matmul ins(%176, %180 : tensor<1x256x32xf32>, tensor<1x32x32xf32>) outs(%65 : tensor<1x256x32xf32>) -> tensor<1x256x32xf32> | |
%182 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_163, %181 : tensor<32xf32>, tensor<1x256x32xf32>) outs(%46 : tensor<1x256x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x32xf32> | |
%expanded_224 = tensor.expand_shape %182 [[0], [1], [2, 3]] output_shape [1, 256, 1, 32] : tensor<1x256x32xf32> into tensor<1x256x1x32xf32> | |
// Stage: permute the two projections and prepare the operands of the next batch matmul.
// %183 / %184 swap dims 1 and 2 (#map9): 1x256x1x32 -> 1x1x256x32.
%183 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_224 : tensor<1x256x1x32xf32>) outs(%71 : tensor<1x1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x256x32xf32> | |
%184 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_223 : tensor<1x256x1x32xf32>) outs(%71 : tensor<1x1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x256x32xf32> | |
// %185 swaps the last two dims of %184 (#map10): 1x1x256x32 -> 1x1x32x256.
%185 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%184 : tensor<1x1x256x32xf32>) outs(%74 : tensor<1x1x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x32x256xf32> | |
// %186 / %187 are element-wise copies; #map11 pins the two leading unit dims to index 0.
// %158 (1x1x16384x32) is produced before this excerpt.
%186 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%158 : tensor<1x1x16384x32xf32>) outs(%39 : tensor<1x1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x16384x32xf32> | |
%187 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%185 : tensor<1x1x32x256xf32>) outs(%74 : tensor<1x1x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x32x256xf32> | |
// Merge the two leading unit dims so both operands are rank 3.
%collapsed_225 = tensor.collapse_shape %186 [[0, 1], [2], [3]] : tensor<1x1x16384x32xf32> into tensor<1x16384x32xf32> | |
%collapsed_226 = tensor.collapse_shape %187 [[0, 1], [2], [3]] : tensor<1x1x32x256xf32> into tensor<1x32x256xf32> | |
// Stage: (1x16384x32) x (1x32x256) batch matmul, then divide every element by the
// rank-0 tensor %cst_200.
// NOTE(review): %cst_200 and the output operands %79 / %81 are defined outside this excerpt.
// The divide looks like an attention-score scaling step. Confirm the constant's value.
%188 = linalg.batch_matmul ins(%collapsed_225, %collapsed_226 : tensor<1x16384x32xf32>, tensor<1x32x256xf32>) outs(%79 : tensor<1x16384x256xf32>) -> tensor<1x16384x256xf32> | |
%expanded_227 = tensor.expand_shape %188 [[0, 1], [2], [3]] output_shape [1, 1, 16384, 256] : tensor<1x16384x256xf32> into tensor<1x1x16384x256xf32> | |
%189 = linalg.generic {indexing_maps = [#map11, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_227, %cst_200 : tensor<1x1x16384x256xf32>, tensor<f32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Stage: softmax-style normalization of %189 over its last dimension (d3):
// subtract the row maximum, exponentiate, sum, divide.
// The init tensors %86, %84, %81 and %91 are defined outside this excerpt.
// %190 reduces over d3 and yields both the running maximum (#0) and the i64 index at
// which a strictly greater value (ogt) was seen (#1).
// Only %190#0 is consumed in the visible lines.
%190:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%189 : tensor<1x1x16384x256xf32>) outs(%86, %84 : tensor<1x1x16384xf32>, tensor<1x1x16384xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x1x16384xf32>, tensor<1x1x16384xi64>) | |
// Re-attach a trailing unit dim so the maximum broadcasts against %189.
%expanded_228 = tensor.expand_shape %190#0 [[0], [1], [2, 3]] output_shape [1, 1, 16384, 1] : tensor<1x1x16384xf32> into tensor<1x1x16384x1xf32> | |
// %189 - rowmax, broadcast through #map14.
%191 = linalg.generic {indexing_maps = [#map11, #map14, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%189, %expanded_228 : tensor<1x1x16384x256xf32>, tensor<1x1x16384x1xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Element-wise exp.
%192 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%191 : tensor<1x1x16384x256xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Sum of the exponentials over d3, accumulated onto %91 (presumably zero-filled).
%193 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%192 : tensor<1x1x16384x256xf32>) outs(%91 : tensor<1x1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x1xf32> | |
// Divide each exponential by its row sum.
%194 = linalg.generic {indexing_maps = [#map11, #map14, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%192, %193 : tensor<1x1x16384x256xf32>, tensor<1x1x16384x1xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Element-wise copy of %194.
%195 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%194 : tensor<1x1x16384x256xf32>) outs(%81 : tensor<1x1x16384x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x16384x256xf32> | |
// Stage: multiply the normalized weights %195 (1x1x16384x256) by %183 (1x1x256x32) and
// bring the result back to a 1x16384x32 layout.
// %36, %71, %97 and %3 are output/init tensors defined outside this excerpt.
// Element-wise copy of %183.
%196 = linalg.generic {indexing_maps = [#map11, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%183 : tensor<1x1x256x32xf32>) outs(%71 : tensor<1x1x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1x256x32xf32> | |
// Merge the two leading unit dims, run the batch matmul, then restore them.
%collapsed_229 = tensor.collapse_shape %195 [[0, 1], [2], [3]] : tensor<1x1x16384x256xf32> into tensor<1x16384x256xf32> | |
%collapsed_230 = tensor.collapse_shape %196 [[0, 1], [2], [3]] : tensor<1x1x256x32xf32> into tensor<1x256x32xf32> | |
%197 = linalg.batch_matmul ins(%collapsed_229, %collapsed_230 : tensor<1x16384x256xf32>, tensor<1x256x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
%expanded_231 = tensor.expand_shape %197 [[0, 1], [2], [3]] output_shape [1, 1, 16384, 32] : tensor<1x16384x32xf32> into tensor<1x1x16384x32xf32> | |
// Swap dims 1 and 2 (#map9): 1x1x16384x32 -> 1x16384x1x32, then fold the last two dims.
%198 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_231 : tensor<1x1x16384x32xf32>) outs(%97 : tensor<1x16384x1x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x1x32xf32> | |
%collapsed_232 = tensor.collapse_shape %198 [[0], [1], [2, 3]] : tensor<1x16384x1x32xf32> into tensor<1x16384x32xf32> | |
// Element-wise copy of %collapsed_232.
%199 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_232 : tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
// Stage: linear projection of %199 with the 32x32 weight %cst_44, plus the vector
// %cst_158, plus an element-wise add with %140.
// NOTE(review): %140 is produced before this excerpt. The add looks like a residual
// connection. Confirm.
%200 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_44 : tensor<32x32xf32>) outs(%34 : tensor<1x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x32xf32> | |
%201 = linalg.batch_matmul ins(%199, %200 : tensor<1x16384x32xf32>, tensor<1x32x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
// Add %cst_158, indexed by d2 (#map7).
%202 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_158, %201 : tensor<32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// %202 + %140, element-wise.
%203 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%202, %140 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Stage: normalization of %203 (1x16384x32) over its last dimension, followed by a
// per-element scale (%cst_157) and shift (%cst_156).
// NOTE(review): this has the shape of a layer normalization, assuming %cst_2 equals the
// reduced extent (32) and %cst_3 is a small epsilon. Both scalars, and the init tensors
// %3, %5 and %6, are defined outside this excerpt. Confirm.
// Sum over d2, accumulated onto %6 (presumably zero-filled).
%204 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%203 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Divide the row sums by %cst_2 (presumably giving the mean).
%205 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%204 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast to 1x16384x32 and subtract from %203.
%206 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%205 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%207 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%203, %206 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Square the centered values, sum over d2, divide by %cst_2 (presumably the variance).
%208 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%207, %207 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%209 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%208 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%210 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%209 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Add %cst_3, then take the reciprocal square root.
%211 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%210 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%212 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%211 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast the rsqrt factor and multiply it into the centered values.
%213 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%212 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%214 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%207, %213 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Multiply by %cst_157 and add %cst_156, both indexed by d2 (#map7).
%215 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%214, %cst_157 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%216 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%215, %cst_156 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Element-wise copy of %216.
%217 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%216 : tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
// Stage: linear projection of %217 from width 32 to width 128 (weight %cst_43), plus the
// vector %cst_155, then swap the last two dims to get 1x128x16384.
// %118, %121, %120 and %124 are output/init tensors defined outside this excerpt.
%218 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_43 : tensor<32x128xf32>) outs(%118 : tensor<1x32x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x128xf32> | |
%219 = linalg.batch_matmul ins(%217, %218 : tensor<1x16384x32xf32>, tensor<1x32x128xf32>) outs(%121 : tensor<1x16384x128xf32>) -> tensor<1x16384x128xf32> | |
// Add %cst_155, indexed by d2 (#map7).
%220 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_155, %219 : tensor<128xf32>, tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// Transpose (#map3): 1x16384x128 -> 1x128x16384.
%221 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%220 : tensor<1x16384x128xf32>) outs(%124 : tensor<1x128x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x128x16384xf32> | |
// Stage: 3x3 depthwise convolution (stride 1, dilation 1) over a 128x128 grid with
// 128 channels, then back to the 1x16384x128 layout.
// Unfold 16384 into 128x128.
%expanded_233 = tensor.expand_shape %221 [[0], [1], [2, 3]] output_shape [1, 128, 128, 128] : tensor<1x128x16384xf32> into tensor<1x128x128x128xf32> | |
// Pad one element of %cst_0 (0.0) on each side of the two trailing dims: 128 -> 130.
%padded_234 = tensor.pad %expanded_233 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x128x128x128xf32> to tensor<1x128x130x130xf32> | |
// Broadcast %cst_153 along d1; used below as the convolution's output operand (bias).
%222 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_153 : tensor<128xf32>) outs(%126 : tensor<1x128x128x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x128x128x128xf32> | |
// Drop the unit dim of the 128x1x3x3 filter to match the chw filter layout.
%collapsed_235 = tensor.collapse_shape %cst_154 [[0, 1], [2], [3]] : tensor<128x1x3x3xf32> into tensor<128x3x3xf32> | |
%223 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_234, %collapsed_235 : tensor<1x128x130x130xf32>, tensor<128x3x3xf32>) outs(%222 : tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32> | |
// Flatten the two trailing dims and transpose (#map3) to 1x16384x128.
%collapsed_236 = tensor.collapse_shape %223 [[0], [1], [2, 3]] : tensor<1x128x128x128xf32> into tensor<1x128x16384xf32> | |
%224 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_236 : tensor<1x128x16384xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x128xf32> | |
// Stage: element-wise activation computed as
//   %224 * (erf(%224 / %cst_201) + %cst_202) * %cst_203.
// NOTE(review): this matches the erf form of GELU if %cst_201 = sqrt(2), %cst_202 = 1
// and %cst_203 = 0.5. The three rank-0 constants are defined outside this excerpt. Confirm.
%225 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%224, %cst_201 : tensor<1x16384x128xf32>, tensor<f32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
%226 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%225 : tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
%227 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%226, %cst_202 : tensor<1x16384x128xf32>, tensor<f32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// Multiply the original input %224 by the shifted erf term.
%228 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%224, %227 : tensor<1x16384x128xf32>, tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
%229 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%228, %cst_203 : tensor<1x16384x128xf32>, tensor<f32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x128xf32> | |
// Element-wise copy of %229.
%230 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%229 : tensor<1x16384x128xf32>) outs(%120 : tensor<1x16384x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x128xf32> | |
// Stage: linear projection of %230 from width 128 back to width 32 (weight %cst_42),
// plus the vector %cst_152, plus an element-wise add with %203.
// NOTE(review): the add with %203 (the input of the preceding normalization) looks like a
// residual connection. Confirm.
%231 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_42 : tensor<128x32xf32>) outs(%136 : tensor<1x128x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x128x32xf32> | |
%232 = linalg.batch_matmul ins(%230, %231 : tensor<1x16384x128xf32>, tensor<1x128x32xf32>) outs(%36 : tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32> | |
// Add %cst_152, indexed by d2 (#map7).
%233 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_152, %232 : tensor<32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// %233 + %203, element-wise.
%234 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%233, %203 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Stage: normalization of %234 (1x16384x32) over its last dimension, followed by a
// per-element scale (%cst_63) and shift (%cst_62).
// NOTE(review): this has the shape of a layer normalization, assuming %cst_2 equals the
// reduced extent (32) and %cst_3 is a small epsilon. Both scalars, and the init tensors
// %3, %5 and %6, are defined outside this excerpt. Confirm.
// Sum over d2, accumulated onto %6 (presumably zero-filled).
%235 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%234 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Divide the row sums by %cst_2 (presumably giving the mean).
%236 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%235 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast to 1x16384x32 and subtract from %234.
%237 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%236 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%238 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%234, %237 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Square the centered values, sum over d2, divide by %cst_2 (presumably the variance).
%239 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%238, %238 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%240 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%239 : tensor<1x16384x32xf32>) outs(%6 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%241 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%240 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Add %cst_3, then take the reciprocal square root.
%242 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%241 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
%243 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%242 : tensor<1x16384x1xf32>) outs(%5 : tensor<1x16384x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x1xf32> | |
// Broadcast the rsqrt factor and multiply it into the centered values.
%244 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%243 : tensor<1x16384x1xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x16384x32xf32> | |
%245 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%238, %244 : tensor<1x16384x32xf32>, tensor<1x16384x32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Multiply by %cst_63 and add %cst_62, both indexed by d2 (#map7).
%246 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%245, %cst_63 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
%247 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%246, %cst_62 : tensor<1x16384x32xf32>, tensor<32xf32>) outs(%3 : tensor<1x16384x32xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x16384x32xf32> | |
// Stage: turn the 1x16384x32 result into a zero-padded 1x32x130x130 tensor for the next
// convolution.
// Unfold 16384 into 128x128: 1x16384x32 -> 1x128x128x32.
%expanded_237 = tensor.expand_shape %247 [[0], [1, 2], [3]] output_shape [1, 128, 128, 32] : tensor<1x16384x32xf32> into tensor<1x128x128x32xf32> | |
// #map17 exchanges dims 1 and 3: 1x128x128x32 -> 1x32x128x128.
// The init tensor %0 is defined outside this excerpt.
%248 = linalg.generic {indexing_maps = [#map1, #map17], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_237 : tensor<1x128x128x32xf32>) outs(%0 : tensor<1x32x128x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x128x128xf32> | |
// #map10 then swaps the last two dims. Combined with %248, the original dim 3 moves to
// position 1 while the two 128-sized dims keep their relative order.
%249 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%248 : tensor<1x32x128x128xf32>) outs(%0 : tensor<1x32x128x128xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x32x128x128xf32> | |
// Pad one element of %cst_0 (0.0) on each side of the two trailing dims: 128 -> 130.
%padded_238 = tensor.pad %249 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x32x128x128xf32> to tensor<1x32x130x130xf32> | |
// Stage: 3x3, stride-2 convolution from 32 to 64 channels with a per-channel bias,
// then flatten and transpose to 1x4096x64.
%250 = tensor.empty() : tensor<1x64x64x64xf32> | |
// Broadcast %cst_194 along d1; used below as the convolution's output operand (bias).
%251 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_194 : tensor<64xf32>) outs(%250 : tensor<1x64x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64x64xf32> | |
// Filter %cst_195 is 64x32x3x3; stride 2 on the padded 130x130 input yields 64x64.
%252 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded_238, %cst_195 : tensor<1x32x130x130xf32>, tensor<64x32x3x3xf32>) outs(%251 : tensor<1x64x64x64xf32>) -> tensor<1x64x64x64xf32> | |
// Flatten 64x64 into 4096 and transpose (#map3) to 1x4096x64.
%collapsed_239 = tensor.collapse_shape %252 [[0], [1], [2, 3]] : tensor<1x64x64x64xf32> into tensor<1x64x4096xf32> | |
%253 = tensor.empty() : tensor<1x4096x64xf32> | |
%254 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_239 : tensor<1x64x4096xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// Stage: normalization of %254 (1x4096x64) over its last dimension, followed by a
// per-element scale (%cst_193) and shift (%cst_192).
// NOTE(review): this has the shape of a layer normalization, assuming %cst_4 equals the
// reduced extent (64) and %cst_3 is a small epsilon. Both scalars are defined outside
// this excerpt. Confirm.
// %255 is an uninitialized 1x4096x1 buffer; %256 is the same shape filled with %cst_0 (0.0)
// and serves as the accumulator init for the reductions below.
%255 = tensor.empty() : tensor<1x4096x1xf32> | |
%256 = linalg.fill ins(%cst_0 : f32) outs(%255 : tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32> | |
// Sum over d2, accumulated onto the zero-filled %256.
%257 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%254 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Divide the row sums by %cst_4 (presumably giving the mean).
%258 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%257 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Broadcast to 1x4096x64 and subtract from %254.
%259 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%258 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%260 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%254, %259 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Square the centered values, sum over d2, divide by %cst_4 (presumably the variance).
%261 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%260, %260 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%262 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%261 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%263 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%262 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Add %cst_3, then take the reciprocal square root.
%264 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%263 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%265 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%264 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Broadcast the rsqrt factor and multiply it into the centered values.
%266 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%265 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%267 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%260, %266 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Multiply by %cst_193 and add %cst_192, both indexed by d2 (#map7).
%268 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%267, %cst_193 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%269 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%268, %cst_192 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%270 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%269 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%271 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%270 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%272 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%271 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%273 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%269, %272 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%274 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%273, %273 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%275 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%274 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%276 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%275 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%277 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%276 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%278 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%277 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%279 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%278 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%280 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%273, %279 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%281 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%280, %cst_151 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%282 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%281, %cst_150 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%283 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%282 : tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%284 = tensor.empty() : tensor<1x64x64xf32> | |
%285 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_41 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%286 = linalg.fill ins(%cst_0 : f32) outs(%253 : tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32> | |
%287 = linalg.batch_matmul ins(%283, %285 : tensor<1x4096x64xf32>, tensor<1x64x64xf32>) outs(%286 : tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32> | |
%288 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_149, %287 : tensor<64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%expanded_240 = tensor.expand_shape %288 [[0], [1], [2, 3]] output_shape [1, 4096, 2, 32] : tensor<1x4096x64xf32> into tensor<1x4096x2x32xf32> | |
%289 = tensor.empty() : tensor<1x2x4096x32xf32> | |
%290 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_240 : tensor<1x4096x2x32xf32>) outs(%289 : tensor<1x2x4096x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x4096x32xf32> | |
%291 = tensor.empty() : tensor<1x64x4096xf32> | |
%292 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%282 : tensor<1x4096x64xf32>) outs(%291 : tensor<1x64x4096xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x4096xf32> | |
%expanded_241 = tensor.expand_shape %292 [[0], [1], [2, 3]] output_shape [1, 64, 64, 64] : tensor<1x64x4096xf32> into tensor<1x64x64x64xf32> | |
%293 = tensor.empty() : tensor<1x64x16x16xf32> | |
%294 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_145 : tensor<64xf32>) outs(%293 : tensor<1x64x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x16x16xf32> | |
%295 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<4> : vector<2xi64>} ins(%expanded_241, %cst_146 : tensor<1x64x64x64xf32>, tensor<64x64x4x4xf32>) outs(%294 : tensor<1x64x16x16xf32>) -> tensor<1x64x16x16xf32> | |
%collapsed_242 = tensor.collapse_shape %295 [[0], [1], [2, 3]] : tensor<1x64x16x16xf32> into tensor<1x64x256xf32> | |
%296 = tensor.empty() : tensor<1x256x64xf32> | |
%297 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_242 : tensor<1x64x256xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
%298 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%297 : tensor<1x256x64xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%299 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%298 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%300 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%299 : tensor<1x256x1xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
%301 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%297, %300 : tensor<1x256x64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%302 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%301, %301 : tensor<1x256x64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%303 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%302 : tensor<1x256x64xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%304 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%303 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%305 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%304 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%306 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%305 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%307 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%306 : tensor<1x256x1xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
%308 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%301, %307 : tensor<1x256x64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%309 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%308, %cst_144 : tensor<1x256x64xf32>, tensor<64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%310 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%309, %cst_143 : tensor<1x256x64xf32>, tensor<64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%311 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%310 : tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
%312 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_40 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%313 = linalg.fill ins(%cst_0 : f32) outs(%296 : tensor<1x256x64xf32>) -> tensor<1x256x64xf32> | |
%314 = linalg.batch_matmul ins(%311, %312 : tensor<1x256x64xf32>, tensor<1x64x64xf32>) outs(%313 : tensor<1x256x64xf32>) -> tensor<1x256x64xf32> | |
%315 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_148, %314 : tensor<64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%expanded_243 = tensor.expand_shape %315 [[0], [1], [2, 3]] output_shape [1, 256, 2, 32] : tensor<1x256x64xf32> into tensor<1x256x2x32xf32> | |
%316 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_39 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%317 = linalg.batch_matmul ins(%311, %316 : tensor<1x256x64xf32>, tensor<1x64x64xf32>) outs(%313 : tensor<1x256x64xf32>) -> tensor<1x256x64xf32> | |
%318 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_147, %317 : tensor<64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%expanded_244 = tensor.expand_shape %318 [[0], [1], [2, 3]] output_shape [1, 256, 2, 32] : tensor<1x256x64xf32> into tensor<1x256x2x32xf32> | |
%319 = tensor.empty() : tensor<1x2x256x32xf32> | |
%320 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_244 : tensor<1x256x2x32xf32>) outs(%319 : tensor<1x2x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x256x32xf32> | |
%321 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_243 : tensor<1x256x2x32xf32>) outs(%319 : tensor<1x2x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x256x32xf32> | |
%322 = tensor.empty() : tensor<1x2x32x256xf32> | |
%323 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%321 : tensor<1x2x256x32xf32>) outs(%322 : tensor<1x2x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x32x256xf32> | |
%324 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%290 : tensor<1x2x4096x32xf32>) outs(%289 : tensor<1x2x4096x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x4096x32xf32> | |
%325 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%323 : tensor<1x2x32x256xf32>) outs(%322 : tensor<1x2x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x32x256xf32> | |
%collapsed_245 = tensor.collapse_shape %324 [[0, 1], [2], [3]] : tensor<1x2x4096x32xf32> into tensor<2x4096x32xf32> | |
%collapsed_246 = tensor.collapse_shape %325 [[0, 1], [2], [3]] : tensor<1x2x32x256xf32> into tensor<2x32x256xf32> | |
%326 = tensor.empty() : tensor<2x4096x256xf32> | |
%327 = linalg.fill ins(%cst_0 : f32) outs(%326 : tensor<2x4096x256xf32>) -> tensor<2x4096x256xf32> | |
%328 = linalg.batch_matmul ins(%collapsed_245, %collapsed_246 : tensor<2x4096x32xf32>, tensor<2x32x256xf32>) outs(%327 : tensor<2x4096x256xf32>) -> tensor<2x4096x256xf32> | |
%expanded_247 = tensor.expand_shape %328 [[0, 1], [2], [3]] output_shape [1, 2, 4096, 256] : tensor<2x4096x256xf32> into tensor<1x2x4096x256xf32> | |
%329 = tensor.empty() : tensor<1x2x4096x256xf32> | |
%330 = linalg.generic {indexing_maps = [#map18, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_247, %cst_200 : tensor<1x2x4096x256xf32>, tensor<f32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
%331 = tensor.empty() : tensor<1x2x4096xi64> | |
%332 = linalg.fill ins(%c0_i64 : i64) outs(%331 : tensor<1x2x4096xi64>) -> tensor<1x2x4096xi64> | |
%333 = tensor.empty() : tensor<1x2x4096xf32> | |
%334 = linalg.fill ins(%cst_1 : f32) outs(%333 : tensor<1x2x4096xf32>) -> tensor<1x2x4096xf32> | |
%335:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%330 : tensor<1x2x4096x256xf32>) outs(%334, %332 : tensor<1x2x4096xf32>, tensor<1x2x4096xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x2x4096xf32>, tensor<1x2x4096xi64>) | |
%expanded_248 = tensor.expand_shape %335#0 [[0], [1], [2, 3]] output_shape [1, 2, 4096, 1] : tensor<1x2x4096xf32> into tensor<1x2x4096x1xf32> | |
%336 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%330, %expanded_248 : tensor<1x2x4096x256xf32>, tensor<1x2x4096x1xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
%337 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%336 : tensor<1x2x4096x256xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
%338 = tensor.empty() : tensor<1x2x4096x1xf32> | |
%339 = linalg.fill ins(%cst_0 : f32) outs(%338 : tensor<1x2x4096x1xf32>) -> tensor<1x2x4096x1xf32> | |
%340 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%337 : tensor<1x2x4096x256xf32>) outs(%339 : tensor<1x2x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x1xf32> | |
%341 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%337, %340 : tensor<1x2x4096x256xf32>, tensor<1x2x4096x1xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
%342 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%341 : tensor<1x2x4096x256xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x4096x256xf32> | |
%343 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%320 : tensor<1x2x256x32xf32>) outs(%319 : tensor<1x2x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x256x32xf32> | |
%collapsed_249 = tensor.collapse_shape %342 [[0, 1], [2], [3]] : tensor<1x2x4096x256xf32> into tensor<2x4096x256xf32> | |
%collapsed_250 = tensor.collapse_shape %343 [[0, 1], [2], [3]] : tensor<1x2x256x32xf32> into tensor<2x256x32xf32> | |
%344 = tensor.empty() : tensor<2x4096x32xf32> | |
%345 = linalg.fill ins(%cst_0 : f32) outs(%344 : tensor<2x4096x32xf32>) -> tensor<2x4096x32xf32> | |
%346 = linalg.batch_matmul ins(%collapsed_249, %collapsed_250 : tensor<2x4096x256xf32>, tensor<2x256x32xf32>) outs(%345 : tensor<2x4096x32xf32>) -> tensor<2x4096x32xf32> | |
%expanded_251 = tensor.expand_shape %346 [[0, 1], [2], [3]] output_shape [1, 2, 4096, 32] : tensor<2x4096x32xf32> into tensor<1x2x4096x32xf32> | |
%347 = tensor.empty() : tensor<1x4096x2x32xf32> | |
%348 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_251 : tensor<1x2x4096x32xf32>) outs(%347 : tensor<1x4096x2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x2x32xf32> | |
%collapsed_252 = tensor.collapse_shape %348 [[0], [1], [2, 3]] : tensor<1x4096x2x32xf32> into tensor<1x4096x64xf32> | |
%349 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_252 : tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%350 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_38 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%351 = linalg.batch_matmul ins(%349, %350 : tensor<1x4096x64xf32>, tensor<1x64x64xf32>) outs(%286 : tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32> | |
%352 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_142, %351 : tensor<64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%353 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%352, %269 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%354 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%353 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%355 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%354 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%356 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%355 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%357 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%353, %356 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%358 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%357, %357 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%359 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%358 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%360 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%359 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%361 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%360 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%362 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%361 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%363 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%362 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%364 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%357, %363 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%365 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%364, %cst_141 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%366 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%365, %cst_140 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%367 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%366 : tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%368 = tensor.empty() : tensor<1x64x256xf32> | |
%369 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_37 : tensor<64x256xf32>) outs(%368 : tensor<1x64x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x256xf32> | |
%370 = tensor.empty() : tensor<1x4096x256xf32> | |
%371 = linalg.fill ins(%cst_0 : f32) outs(%370 : tensor<1x4096x256xf32>) -> tensor<1x4096x256xf32> | |
%372 = linalg.batch_matmul ins(%367, %369 : tensor<1x4096x64xf32>, tensor<1x64x256xf32>) outs(%371 : tensor<1x4096x256xf32>) -> tensor<1x4096x256xf32> | |
%373 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_139, %372 : tensor<256xf32>, tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
%374 = tensor.empty() : tensor<1x256x4096xf32> | |
%375 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%373 : tensor<1x4096x256xf32>) outs(%374 : tensor<1x256x4096xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x4096xf32> | |
%expanded_253 = tensor.expand_shape %375 [[0], [1], [2, 3]] output_shape [1, 256, 64, 64] : tensor<1x256x4096xf32> into tensor<1x256x64x64xf32> | |
%padded_254 = tensor.pad %expanded_253 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x256x64x64xf32> to tensor<1x256x66x66xf32> | |
%376 = tensor.empty() : tensor<1x256x64x64xf32> | |
%377 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_137 : tensor<256xf32>) outs(%376 : tensor<1x256x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64x64xf32> | |
%collapsed_255 = tensor.collapse_shape %cst_138 [[0, 1], [2], [3]] : tensor<256x1x3x3xf32> into tensor<256x3x3xf32> | |
%378 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_254, %collapsed_255 : tensor<1x256x66x66xf32>, tensor<256x3x3xf32>) outs(%377 : tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32> | |
%collapsed_256 = tensor.collapse_shape %378 [[0], [1], [2, 3]] : tensor<1x256x64x64xf32> into tensor<1x256x4096xf32> | |
// Transpose 1x256x4096 -> 1x4096x256 (#map3 swaps d1 and d2), then compute
// elementwise:  y = x * (erf(x / %cst_201) + %cst_202) * %cst_203.
// NOTE(review): this has the form of an erf-based GELU, which would require
// %cst_201 / %cst_202 / %cst_203 to be sqrt(2) / 1.0 / 0.5; those constants are
// defined outside this chunk -- confirm.
// %370 only serves as the destination operand; no payload below reads %out.
%379 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_256 : tensor<1x256x4096xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x256xf32> | |
// x / %cst_201 (scalar tensor broadcast through #map16).
%380 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%379, %cst_201 : tensor<1x4096x256xf32>, tensor<f32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
%381 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%380 : tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// erf(...) + %cst_202.
%382 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%381, %cst_202 : tensor<1x4096x256xf32>, tensor<f32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// Multiply by the original (pre-erf) value %379.
%383 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%379, %382 : tensor<1x4096x256xf32>, tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// Scale by %cst_203.
%384 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%383, %cst_203 : tensor<1x4096x256xf32>, tensor<f32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// Elementwise copy of %384 (payload just yields %in).
%385 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%384 : tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x256xf32> | |
// Linear projection 256 -> 64 followed by a bias add and an elementwise add
// with %353:  %389 = (%385 x %cst_36 + %cst_136) + %353.
// The 256x64 weight is first broadcast to a batched 1x256x64 tensor so that it
// can be fed to linalg.batch_matmul.
// NOTE(review): batch_matmul accumulates into its outs operand %286, which is
// defined outside this chunk -- presumably zero-filled; confirm.
%386 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_36 : tensor<256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
%387 = linalg.batch_matmul ins(%385, %386 : tensor<1x4096x256xf32>, tensor<1x256x64xf32>) outs(%286 : tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32> | |
// Add the length-64 vector %cst_136 along the last dim (#map7 selects d2).
%388 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_136, %387 : tensor<64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Elementwise add with %353 (produced earlier in the function, outside this chunk).
%389 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%388, %353 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Normalization of %389 along its last dimension (size 64):
//   m = sum(x) / %cst_4
//   d = x - m
//   v = sum(d * d) / %cst_4
//   y = d * rsqrt(v + %cst_3) * %cst_135 + %cst_134
// NOTE(review): this matches layer normalization provided %cst_4 is 64.0,
// %cst_3 is the epsilon, and the reduction init %256 is zero-filled; all three
// are defined outside this chunk -- confirm.
// Sum over d2 (the only "reduction" iterator), accumulating into %256.
%390 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%389 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%391 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%390 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Broadcast the per-row value back to 1x4096x64 (#map4 pins the last index to 0).
%392 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%391 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// d = x - m
%393 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%389, %392 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// d * d (both inputs are %393)
%394 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%393, %393 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%395 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%394 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%396 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%395 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// v + %cst_3, then rsqrt.
%397 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%396 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%398 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%397 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%399 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%398 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%400 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%393, %399 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Per-feature multiply by %cst_135, then per-feature add of %cst_134 (#map7 selects d2).
%401 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%400, %cst_135 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%402 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%401, %cst_134 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Elementwise copy of %402 (payload just yields %in).
%403 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%402 : tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// 64 -> 64 linear projection of %403 (weight %cst_35, bias %cst_133), then a
// split of the 64 features into 2 groups of 32 and a transpose to
// 1x2x4096x32 (#map9 swaps d1 and d2).
// %407 becomes the left-hand operand of the first attention-style batch matmul
// further down (via %435 / %collapsed_262).
// NOTE(review): %286 (matmul accumulator) is defined outside this chunk --
// presumably zero-filled; confirm.
%404 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_35 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%405 = linalg.batch_matmul ins(%403, %404 : tensor<1x4096x64xf32>, tensor<1x64x64xf32>) outs(%286 : tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32> | |
%406 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_133, %405 : tensor<64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%expanded_257 = tensor.expand_shape %406 [[0], [1], [2, 3]] output_shape [1, 4096, 2, 32] : tensor<1x4096x64xf32> into tensor<1x4096x2x32xf32> | |
%407 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_257 : tensor<1x4096x2x32xf32>) outs(%289 : tensor<1x2x4096x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x4096x32xf32> | |
// Spatial down-sampling of the normalized tokens %402 with a strided conv:
//   1x4096x64 -> transpose -> 1x64x4096 -> view as 1x64x64x64 (NCHW)
//   -> 4x4 convolution with stride 4 (filter %cst_130) -> 1x64x16x16
//   -> flatten to 1x64x256 -> transpose -> 1x256x64.
// %409 broadcasts the per-channel vector %cst_129 over the conv output shape
// and is passed as the conv init operand, so it is added to the conv sum.
%408 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%402 : tensor<1x4096x64xf32>) outs(%291 : tensor<1x64x4096xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x4096xf32> | |
%expanded_258 = tensor.expand_shape %408 [[0], [1], [2, 3]] output_shape [1, 64, 64, 64] : tensor<1x64x4096xf32> into tensor<1x64x64x64xf32> | |
%409 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_129 : tensor<64xf32>) outs(%293 : tensor<1x64x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x16x16xf32> | |
%410 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<4> : vector<2xi64>} ins(%expanded_258, %cst_130 : tensor<1x64x64x64xf32>, tensor<64x64x4x4xf32>) outs(%409 : tensor<1x64x16x16xf32>) -> tensor<1x64x16x16xf32> | |
%collapsed_259 = tensor.collapse_shape %410 [[0], [1], [2, 3]] : tensor<1x64x16x16xf32> into tensor<1x64x256xf32> | |
%411 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_259 : tensor<1x64x256xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
// Normalization of the down-sampled tokens %411 along the last dimension (size 64):
//   m = sum(x) / %cst_4
//   d = x - m
//   v = sum(d * d) / %cst_4
//   y = d * rsqrt(v + %cst_3) * %cst_128 + %cst_127
// NOTE(review): this matches layer normalization provided %cst_4 is 64.0,
// %cst_3 is the epsilon, and the reduction init %49 is zero-filled; all three
// are defined outside this chunk -- confirm.
%412 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%411 : tensor<1x256x64xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%413 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%412 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Broadcast the per-row value back to 1x256x64.
%414 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%413 : tensor<1x256x1xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
// d = x - m
%415 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%411, %414 : tensor<1x256x64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
// d * d (both inputs are %415)
%416 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%415, %415 : tensor<1x256x64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%417 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%416 : tensor<1x256x64xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%418 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%417 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// v + %cst_3, then rsqrt.
%419 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%418 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%420 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%419 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%421 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%420 : tensor<1x256x1xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
%422 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%415, %421 : tensor<1x256x64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
// Per-feature multiply by %cst_128, then per-feature add of %cst_127.
%423 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%422, %cst_128 : tensor<1x256x64xf32>, tensor<64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%424 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%423, %cst_127 : tensor<1x256x64xf32>, tensor<64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
// Elementwise copy of %424 (payload just yields %in).
%425 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%424 : tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
// Two independent 64 -> 64 linear projections of the normalized, down-sampled
// tokens %425, each split into 2 groups of 32 features:
//   %expanded_260 = %425 x %cst_34 + %cst_132   (-> %433 -> %434, shape 1x2x32x256)
//   %expanded_261 = %425 x %cst_33 + %cst_131   (-> %432, shape 1x2x256x32)
// %434 is the right-hand operand of the first attention-style batch matmul
// below and %432 the right-hand operand of the second one.
// NOTE(review): %313 (matmul accumulator) is defined outside this chunk --
// presumably zero-filled; confirm.
%426 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_34 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%427 = linalg.batch_matmul ins(%425, %426 : tensor<1x256x64xf32>, tensor<1x64x64xf32>) outs(%313 : tensor<1x256x64xf32>) -> tensor<1x256x64xf32> | |
%428 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_132, %427 : tensor<64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%expanded_260 = tensor.expand_shape %428 [[0], [1], [2, 3]] output_shape [1, 256, 2, 32] : tensor<1x256x64xf32> into tensor<1x256x2x32xf32> | |
%429 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_33 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%430 = linalg.batch_matmul ins(%425, %429 : tensor<1x256x64xf32>, tensor<1x64x64xf32>) outs(%313 : tensor<1x256x64xf32>) -> tensor<1x256x64xf32> | |
%431 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_131, %430 : tensor<64xf32>, tensor<1x256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x64xf32> | |
%expanded_261 = tensor.expand_shape %431 [[0], [1], [2, 3]] output_shape [1, 256, 2, 32] : tensor<1x256x64xf32> into tensor<1x256x2x32xf32> | |
// Move the group-of-32 dim in front of the token dim: 1x256x2x32 -> 1x2x256x32.
%432 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_261 : tensor<1x256x2x32xf32>) outs(%319 : tensor<1x2x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x256x32xf32> | |
%433 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_260 : tensor<1x256x2x32xf32>) outs(%319 : tensor<1x2x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x256x32xf32> | |
// Swap the last two dims (#map10): 1x2x256x32 -> 1x2x32x256.
%434 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%433 : tensor<1x2x256x32xf32>) outs(%322 : tensor<1x2x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x32x256xf32> | |
// First attention-style product: (2x4096x32) x (2x32x256) -> 2x4096x256,
// followed by an elementwise division by the scalar %cst_200.
// %435 and %436 are elementwise copies of %407 and %434; the leading unit batch
// dim is folded into the group dim so linalg.batch_matmul sees a 3-D operand,
// and is restored afterwards by the expand_shape.
// NOTE(review): %cst_200 is presumably the attention scale (e.g. sqrt(32)) and
// %327 a zero-filled accumulator; both are defined outside this chunk -- confirm.
%435 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%407 : tensor<1x2x4096x32xf32>) outs(%289 : tensor<1x2x4096x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x4096x32xf32> | |
%436 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%434 : tensor<1x2x32x256xf32>) outs(%322 : tensor<1x2x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x32x256xf32> | |
%collapsed_262 = tensor.collapse_shape %435 [[0, 1], [2], [3]] : tensor<1x2x4096x32xf32> into tensor<2x4096x32xf32> | |
%collapsed_263 = tensor.collapse_shape %436 [[0, 1], [2], [3]] : tensor<1x2x32x256xf32> into tensor<2x32x256xf32> | |
%437 = linalg.batch_matmul ins(%collapsed_262, %collapsed_263 : tensor<2x4096x32xf32>, tensor<2x32x256xf32>) outs(%327 : tensor<2x4096x256xf32>) -> tensor<2x4096x256xf32> | |
%expanded_264 = tensor.expand_shape %437 [[0, 1], [2], [3]] output_shape [1, 2, 4096, 256] : tensor<2x4096x256xf32> into tensor<1x2x4096x256xf32> | |
%438 = linalg.generic {indexing_maps = [#map18, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_264, %cst_200 : tensor<1x2x4096x256xf32>, tensor<f32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
// Max-subtracted exponential normalization of %438 over its last dim (size 256):
//   mx = max(x);  e = exp(x - mx);  y = e / sum(e)
// i.e. the numerically-stable softmax formulation.
// NOTE(review): correctness relies on the reduction inits defined outside this
// chunk -- %334 presumably -inf, %332 presumably 0, %339 presumably 0; confirm.
// Running max over d3 together with the index at which it occurs. The strict
// `ogt` compare keeps the previously stored index on ties. Only result #0 (the
// max value) is consumed in this chunk.
%439:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%438 : tensor<1x2x4096x256xf32>) outs(%334, %332 : tensor<1x2x4096xf32>, tensor<1x2x4096xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x2x4096xf32>, tensor<1x2x4096xi64>) | |
// Re-add a trailing unit dim so the max can be broadcast along d3 via #map19.
%expanded_265 = tensor.expand_shape %439#0 [[0], [1], [2, 3]] output_shape [1, 2, 4096, 1] : tensor<1x2x4096xf32> into tensor<1x2x4096x1xf32> | |
%440 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%438, %expanded_265 : tensor<1x2x4096x256xf32>, tensor<1x2x4096x1xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
%441 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%440 : tensor<1x2x4096x256xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
// Sum of the exponentials over d3, accumulated into %339.
%442 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%441 : tensor<1x2x4096x256xf32>) outs(%339 : tensor<1x2x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x1xf32> | |
%443 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%441, %442 : tensor<1x2x4096x256xf32>, tensor<1x2x4096x1xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x2x4096x256xf32> | |
// Elementwise copy of %443 (payload just yields %in).
%444 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%443 : tensor<1x2x4096x256xf32>) outs(%329 : tensor<1x2x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x4096x256xf32> | |
// Second attention-style product: normalized weights %444 (2x4096x256) times
// %432 (2x256x32) -> 2x4096x32, then the two groups of 32 features are merged
// back into a single 64-wide feature dim:
//   1x2x4096x32 -> transpose (#map9) -> 1x4096x2x32 -> collapse -> 1x4096x64.
// %445 and %448 are elementwise copies (their payloads just yield %in).
// NOTE(review): %345 (matmul accumulator) is defined outside this chunk --
// presumably zero-filled; confirm.
%445 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%432 : tensor<1x2x256x32xf32>) outs(%319 : tensor<1x2x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x2x256x32xf32> | |
%collapsed_266 = tensor.collapse_shape %444 [[0, 1], [2], [3]] : tensor<1x2x4096x256xf32> into tensor<2x4096x256xf32> | |
%collapsed_267 = tensor.collapse_shape %445 [[0, 1], [2], [3]] : tensor<1x2x256x32xf32> into tensor<2x256x32xf32> | |
%446 = linalg.batch_matmul ins(%collapsed_266, %collapsed_267 : tensor<2x4096x256xf32>, tensor<2x256x32xf32>) outs(%345 : tensor<2x4096x32xf32>) -> tensor<2x4096x32xf32> | |
%expanded_268 = tensor.expand_shape %446 [[0, 1], [2], [3]] output_shape [1, 2, 4096, 32] : tensor<2x4096x32xf32> into tensor<1x2x4096x32xf32> | |
%447 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_268 : tensor<1x2x4096x32xf32>) outs(%347 : tensor<1x4096x2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x2x32xf32> | |
%collapsed_269 = tensor.collapse_shape %447 [[0], [1], [2, 3]] : tensor<1x4096x2x32xf32> into tensor<1x4096x64xf32> | |
%448 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_269 : tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// 64 -> 64 linear projection of the attention output, plus bias, plus an
// elementwise add with %389 (the tensor that was fed into the preceding
// normalization):  %452 = (%448 x %cst_32 + %cst_126) + %389.
// NOTE(review): %286 (matmul accumulator) is defined outside this chunk --
// presumably zero-filled; confirm.
%449 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_32 : tensor<64x64xf32>) outs(%284 : tensor<1x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64xf32> | |
%450 = linalg.batch_matmul ins(%448, %449 : tensor<1x4096x64xf32>, tensor<1x64x64xf32>) outs(%286 : tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32> | |
%451 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_126, %450 : tensor<64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Residual-style add with %389.
%452 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%451, %389 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Normalization of %452 along its last dimension (size 64):
//   m = sum(x) / %cst_4
//   d = x - m
//   v = sum(d * d) / %cst_4
//   y = d * rsqrt(v + %cst_3) * %cst_125 + %cst_124
// NOTE(review): this matches layer normalization provided %cst_4 is 64.0,
// %cst_3 is the epsilon, and the reduction init %256 is zero-filled; all three
// are defined outside this chunk -- confirm.
%453 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%452 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%454 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%453 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Broadcast the per-row value back to 1x4096x64.
%455 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%454 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// d = x - m
%456 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%452, %455 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// d * d (both inputs are %456)
%457 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%456, %456 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%458 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%457 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%459 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%458 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// v + %cst_3, then rsqrt.
%460 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%459 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%461 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%460 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
%462 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%461 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
%463 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%456, %462 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Per-feature multiply by %cst_125, then per-feature add of %cst_124.
%464 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%463, %cst_125 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
%465 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%464, %cst_124 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Elementwise copy of %465 (payload just yields %in).
%466 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%465 : tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// 64 -> 256 linear projection of %466 (weight %cst_31, bias %cst_123), then a
// transpose to channel-major layout 1x256x4096 (#map3 swaps d1 and d2) so the
// result can be viewed as an NCHW image by the following ops.
// NOTE(review): %371 (matmul accumulator) is defined outside this chunk --
// presumably zero-filled; confirm.
%467 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_31 : tensor<64x256xf32>) outs(%368 : tensor<1x64x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x256xf32> | |
%468 = linalg.batch_matmul ins(%466, %467 : tensor<1x4096x64xf32>, tensor<1x64x256xf32>) outs(%371 : tensor<1x4096x256xf32>) -> tensor<1x4096x256xf32> | |
%469 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_123, %468 : tensor<256xf32>, tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
%470 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%469 : tensor<1x4096x256xf32>) outs(%374 : tensor<1x256x4096xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x4096xf32> | |
// Depthwise 3x3 convolution over 256 channels.
// %470 (1x256x4096) is viewed as a 1x256x64x64 image and padded by one element
// on each side of both spatial dims with %cst_0 (0.0), giving 1x256x66x66.
%expanded_270 = tensor.expand_shape %470 [[0], [1], [2, 3]] output_shape [1, 256, 64, 64] : tensor<1x256x4096xf32> into tensor<1x256x64x64xf32> | |
%padded_271 = tensor.pad %expanded_270 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x256x64x64xf32> to tensor<1x256x66x66xf32> | |
// Broadcast the per-channel vector %cst_121 over the output shape; it is the
// convolution's init operand, so it is added to the convolution sum.
// The destination %376 is the tensor.empty created earlier in the function.
%471 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_121 : tensor<256xf32>) outs(%376 : tensor<1x256x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64x64xf32> | |
// Drop the unit dim of the 256x1x3x3 filter to get the 256x3x3 (chw) layout,
// run the stride-1 depthwise conv, and flatten the result back to 1x256x4096.
%collapsed_272 = tensor.collapse_shape %cst_122 [[0, 1], [2], [3]] : tensor<256x1x3x3xf32> into tensor<256x3x3xf32> | |
%472 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_271, %collapsed_272 : tensor<1x256x66x66xf32>, tensor<256x3x3xf32>) outs(%471 : tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32> | |
%collapsed_273 = tensor.collapse_shape %472 [[0], [1], [2, 3]] : tensor<1x256x64x64xf32> into tensor<1x256x4096xf32> | |
// Transpose 1x256x4096 -> 1x4096x256 (#map3 swaps d1 and d2), then compute
// elementwise:  y = x * (erf(x / %cst_201) + %cst_202) * %cst_203.
// NOTE(review): this has the form of an erf-based GELU, which would require
// %cst_201 / %cst_202 / %cst_203 to be sqrt(2) / 1.0 / 0.5; those constants are
// defined outside this chunk -- confirm.
// %370 only serves as the destination operand; no payload below reads %out.
%473 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_273 : tensor<1x256x4096xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x256xf32> | |
// x / %cst_201 (scalar tensor broadcast through #map16).
%474 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473, %cst_201 : tensor<1x4096x256xf32>, tensor<f32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
%475 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%474 : tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// erf(...) + %cst_202.
%476 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%475, %cst_202 : tensor<1x4096x256xf32>, tensor<f32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// Multiply by the original (pre-erf) value %473.
%477 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%473, %476 : tensor<1x4096x256xf32>, tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// Scale by %cst_203.
%478 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%477, %cst_203 : tensor<1x4096x256xf32>, tensor<f32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x256xf32> | |
// Elementwise copy of %478 (payload just yields %in).
%479 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%478 : tensor<1x4096x256xf32>) outs(%370 : tensor<1x4096x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x256xf32> | |
// Projection from 256 to 64 features, followed by a bias add and an add with
// the earlier value %452.
// Broadcast the 256x64 weight %cst_30 along a leading batch dim of 1.
%480 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_30 : tensor<256x64xf32>) outs(%296 : tensor<1x256x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x64xf32> | |
// 1x4096x256 @ 1x256x64 -> 1x4096x64.
// NOTE(review): the accumulator %286 is defined outside this chunk;
// presumably zero-filled - confirm.
%481 = linalg.batch_matmul ins(%479, %480 : tensor<1x4096x256xf32>, tensor<1x256x64xf32>) outs(%286 : tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32> | |
// Add the 64-element vector %cst_120, broadcast along the last dim (#map7).
%482 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_120, %481 : tensor<64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Elementwise add with %452 (defined outside this chunk; presumably a
// residual/skip connection - confirm).
%483 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%482, %452 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Normalization of %483 over its last (64-wide) dim:
//   y = (x - sum(x)/%cst_4) * rsqrt(sum((x - mean)^2)/%cst_4 + %cst_3) * %cst_61 + %cst_60
// NOTE(review): this matches the layer-norm pattern if %cst_4 equals the
// reduced size (64.0) and %cst_3 is a small epsilon; both constants, and the
// reduction init %256 (presumably zero-filled), are defined outside this
// chunk - confirm.
// Sum over the last dim (d2 is the reduction iterator).
%484 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%483 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Divide the sum by %cst_4.
%485 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%484 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Broadcast the per-row value across the 64-wide dim.
%486 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%485 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// Center: x - broadcast value.
%487 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%483, %486 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Square the centered values (%487 * %487).
%488 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%487, %487 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Sum the squares over the last dim.
%489 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%488 : tensor<1x4096x64xf32>) outs(%256 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Divide the sum of squares by %cst_4.
%490 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%489 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_4 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Add %cst_3 before taking the reciprocal square root.
%491 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%490 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Reciprocal square root.
%492 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%491 : tensor<1x4096x1xf32>) outs(%255 : tensor<1x4096x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x1xf32> | |
// Broadcast the per-row factor across the 64-wide dim.
%493 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%492 : tensor<1x4096x1xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x4096x64xf32> | |
// Multiply the centered values by the broadcast factor.
%494 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%487, %493 : tensor<1x4096x64xf32>, tensor<1x4096x64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Multiply by the 64-element vector %cst_61 along the last dim.
%495 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%494, %cst_61 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Add the 64-element vector %cst_60 along the last dim.
%496 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%495, %cst_60 : tensor<1x4096x64xf32>, tensor<64xf32>) outs(%253 : tensor<1x4096x64xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x4096x64xf32> | |
// Reshape the 1x4096x64 sequence to 4-D, permute it, and zero-pad it.
// Split the 4096 dim into 64x64.
%expanded_274 = tensor.expand_shape %496 [[0], [1, 2], [3]] output_shape [1, 64, 64, 64] : tensor<1x4096x64xf32> into tensor<1x64x64x64xf32> | |
// Permute via #map17: out[d0, d3, d2, d1] = in[d0, d1, d2, d3] (swap dims 1 and 3).
%497 = linalg.generic {indexing_maps = [#map1, #map17], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_274 : tensor<1x64x64x64xf32>) outs(%250 : tensor<1x64x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64x64xf32> | |
// Permute via #map10: swap the last two dims. Combined with the previous step,
// the original last dim ends up at position 1.
%498 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%497 : tensor<1x64x64x64xf32>) outs(%250 : tensor<1x64x64x64xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x64x64x64xf32> | |
// Pad the two trailing dims by 1 on each side with %cst_0 (0.0): 64x64 -> 66x66.
%padded_275 = tensor.pad %498 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x64x64x64xf32> to tensor<1x64x66x66xf32> | |
// 3x3 stride-2 convolution from 64 to 160 channels, then flatten and transpose
// to a 1x1024x160 sequence.
%499 = tensor.empty() : tensor<1x160x32x32xf32> | |
// Broadcast the 160-element vector %cst_190 along dim 1 (#map) to build the
// convolution's initial output, so it acts as a per-channel bias.
%500 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_190 : tensor<160xf32>) outs(%499 : tensor<1x160x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x32x32xf32> | |
// The convolution accumulates into %500: 1x64x66x66 -> 1x160x32x32.
%501 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded_275, %cst_191 : tensor<1x64x66x66xf32>, tensor<160x64x3x3xf32>) outs(%500 : tensor<1x160x32x32xf32>) -> tensor<1x160x32x32xf32> | |
// Flatten the 32x32 trailing dims into 1024.
%collapsed_276 = tensor.collapse_shape %501 [[0], [1], [2, 3]] : tensor<1x160x32x32xf32> into tensor<1x160x1024xf32> | |
%502 = tensor.empty() : tensor<1x1024x160xf32> | |
// Transpose the last two dims: 1x160x1024 -> 1x1024x160.
%503 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_276 : tensor<1x160x1024xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
// Normalization of %503 over its last (160-wide) dim:
//   y = (x - sum(x)/%cst_5) * rsqrt(sum((x - mean)^2)/%cst_5 + %cst_3) * %cst_189 + %cst_188
// NOTE(review): this matches the layer-norm pattern if %cst_5 equals the
// reduced size (160.0) and %cst_3 is a small epsilon; both are defined
// outside this chunk - confirm.
%504 = tensor.empty() : tensor<1x1024x1xf32> | |
// Zero-initialized accumulator (%cst_0 is 0.0) shared by the sum reductions below.
%505 = linalg.fill ins(%cst_0 : f32) outs(%504 : tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32> | |
// Sum over the last dim.
%506 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%503 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Divide the sum by %cst_5.
%507 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%506 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Broadcast the per-row value across the 160-wide dim.
%508 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%507 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
// Center: x - broadcast value.
%509 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%503, %508 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Square the centered values.
%510 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%509, %509 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Sum the squares over the last dim.
%511 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%510 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Divide the sum of squares by %cst_5.
%512 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%511 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Add %cst_3 before taking the reciprocal square root.
%513 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%512 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Reciprocal square root.
%514 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%513 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Broadcast the per-row factor across the 160-wide dim.
%515 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%514 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
// Multiply the centered values by the broadcast factor.
%516 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%509, %515 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Multiply by the 160-element vector %cst_189 along the last dim.
%517 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%516, %cst_189 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Add the 160-element vector %cst_188 along the last dim.
%518 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%517, %cst_188 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Second normalization over the 160-wide dim, applied to %518, with the same
// op sequence as the preceding one but with scale %cst_119 and shift %cst_118.
// NOTE(review): layer-norm pattern assuming %cst_5 == 160.0 and %cst_3 is an
// epsilon (both defined outside this chunk) - confirm.
// Sum over the last dim, accumulating onto the zero-filled %505.
%519 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%518 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Divide the sum by %cst_5.
%520 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%519 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Broadcast the per-row value across the 160-wide dim.
%521 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%520 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
// Center: x - broadcast value.
%522 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%518, %521 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Square the centered values.
%523 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%522, %522 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Sum the squares over the last dim.
%524 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%523 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Divide the sum of squares by %cst_5.
%525 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%524 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Add %cst_3 before taking the reciprocal square root.
%526 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%525 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Reciprocal square root.
%527 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%526 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
// Broadcast the per-row factor across the 160-wide dim.
%528 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%527 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
// Multiply the centered values by the broadcast factor.
%529 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%522, %528 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Multiply by the 160-element vector %cst_119 along the last dim.
%530 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%529, %cst_119 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Add the 160-element vector %cst_118 along the last dim.
%531 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%530, %cst_118 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Identity copy of %531 (the body only yields its input).
%532 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%531 : tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
// 160 -> 160 projection of %532 with weight %cst_29 and bias %cst_117, then a
// split of the 160 features into 5 groups of 32 and a permute to 1x5x1024x32.
// NOTE(review): presumably the attention query path - confirm.
%533 = tensor.empty() : tensor<1x160x160xf32> | |
// Broadcast the 160x160 weight along a leading batch dim of 1.
%534 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_29 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
// Zero-filled accumulator (%cst_0 is 0.0) for the matmul.
%535 = linalg.fill ins(%cst_0 : f32) outs(%502 : tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32> | |
// 1x1024x160 @ 1x160x160 -> 1x1024x160.
%536 = linalg.batch_matmul ins(%532, %534 : tensor<1x1024x160xf32>, tensor<1x160x160xf32>) outs(%535 : tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32> | |
// Add the 160-element vector %cst_117 along the last dim.
%537 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_117, %536 : tensor<160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
// Split the last dim 160 into 5x32.
%expanded_277 = tensor.expand_shape %537 [[0], [1], [2, 3]] output_shape [1, 1024, 5, 32] : tensor<1x1024x160xf32> into tensor<1x1024x5x32xf32> | |
%538 = tensor.empty() : tensor<1x5x1024x32xf32> | |
// Swap dims 1 and 2 (#map9): 1x1024x5x32 -> 1x5x1024x32.
%539 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_277 : tensor<1x1024x5x32xf32>) outs(%538 : tensor<1x5x1024x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x1024x32xf32> | |
// Reduce the 1024-element sequence %531 to 256 elements with a 2x2 stride-2
// convolution over its 32x32 layout.
%540 = tensor.empty() : tensor<1x160x1024xf32> | |
// Transpose the last two dims: 1x1024x160 -> 1x160x1024.
%541 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%531 : tensor<1x1024x160xf32>) outs(%540 : tensor<1x160x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x1024xf32> | |
// Split 1024 into 32x32.
%expanded_278 = tensor.expand_shape %541 [[0], [1], [2, 3]] output_shape [1, 160, 32, 32] : tensor<1x160x1024xf32> into tensor<1x160x32x32xf32> | |
%542 = tensor.empty() : tensor<1x160x16x16xf32> | |
// Broadcast the 160-element vector %cst_113 along dim 1 to build the
// convolution's initial output, so it acts as a per-channel bias.
%543 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_113 : tensor<160xf32>) outs(%542 : tensor<1x160x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x16x16xf32> | |
// 2x2 kernel, stride 2, no padding: 1x160x32x32 -> 1x160x16x16.
%544 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%expanded_278, %cst_114 : tensor<1x160x32x32xf32>, tensor<160x160x2x2xf32>) outs(%543 : tensor<1x160x16x16xf32>) -> tensor<1x160x16x16xf32> | |
// Flatten 16x16 into 256.
%collapsed_279 = tensor.collapse_shape %544 [[0], [1], [2, 3]] : tensor<1x160x16x16xf32> into tensor<1x160x256xf32> | |
%545 = tensor.empty() : tensor<1x256x160xf32> | |
// Transpose the last two dims: 1x160x256 -> 1x256x160.
%546 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_279 : tensor<1x160x256xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
// Normalization of the reduced sequence %546 (1x256x160) over its last dim:
//   y = (x - sum(x)/%cst_5) * rsqrt(sum((x - mean)^2)/%cst_5 + %cst_3) * %cst_112 + %cst_111
// NOTE(review): layer-norm pattern assuming %cst_5 == 160.0 and %cst_3 is an
// epsilon; the reduction init %49 (presumably zero-filled), %48 and both
// constants are defined outside this chunk - confirm.
// Sum over the last dim.
%547 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%546 : tensor<1x256x160xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Divide the sum by %cst_5.
%548 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%547 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Broadcast the per-row value across the 160-wide dim.
%549 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%548 : tensor<1x256x1xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
// Center: x - broadcast value.
%550 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%546, %549 : tensor<1x256x160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
// Square the centered values.
%551 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%550, %550 : tensor<1x256x160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
// Sum the squares over the last dim.
%552 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%551 : tensor<1x256x160xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Divide the sum of squares by %cst_5.
%553 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%552 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Add %cst_3 before taking the reciprocal square root.
%554 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%553 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Reciprocal square root.
%555 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%554 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
// Broadcast the per-row factor across the 160-wide dim.
%556 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%555 : tensor<1x256x1xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
// Multiply the centered values by the broadcast factor.
%557 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%550, %556 : tensor<1x256x160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
// Multiply by the 160-element vector %cst_112 along the last dim.
%558 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%557, %cst_112 : tensor<1x256x160xf32>, tensor<160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
// Add the 160-element vector %cst_111 along the last dim.
%559 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%558, %cst_111 : tensor<1x256x160xf32>, tensor<160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
// Identity copy of %559 (the body only yields its input).
%560 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%559 : tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
// Two independent 160 -> 160 projections of the reduced sequence %560, each
// split into 5 groups of 32 and permuted to 1x5x256x32.
// NOTE(review): presumably the attention key (%cst_28/%cst_116) and value
// (%cst_27/%cst_115) paths, judging by how %572 and %569 are consumed later -
// confirm.
// Broadcast weight %cst_28 along a leading batch dim of 1.
%561 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_28 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
// Zero-filled accumulator (%cst_0 is 0.0), used by both matmuls below.
%562 = linalg.fill ins(%cst_0 : f32) outs(%545 : tensor<1x256x160xf32>) -> tensor<1x256x160xf32> | |
// First projection: 1x256x160 @ 1x160x160.
%563 = linalg.batch_matmul ins(%560, %561 : tensor<1x256x160xf32>, tensor<1x160x160xf32>) outs(%562 : tensor<1x256x160xf32>) -> tensor<1x256x160xf32> | |
// Add the 160-element vector %cst_116 along the last dim.
%564 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_116, %563 : tensor<160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
// Split the last dim 160 into 5x32.
%expanded_280 = tensor.expand_shape %564 [[0], [1], [2, 3]] output_shape [1, 256, 5, 32] : tensor<1x256x160xf32> into tensor<1x256x5x32xf32> | |
// Broadcast weight %cst_27 along a leading batch dim of 1.
%565 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_27 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
// Second projection: 1x256x160 @ 1x160x160.
%566 = linalg.batch_matmul ins(%560, %565 : tensor<1x256x160xf32>, tensor<1x160x160xf32>) outs(%562 : tensor<1x256x160xf32>) -> tensor<1x256x160xf32> | |
// Add the 160-element vector %cst_115 along the last dim.
%567 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_115, %566 : tensor<160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
// Split the last dim 160 into 5x32.
%expanded_281 = tensor.expand_shape %567 [[0], [1], [2, 3]] output_shape [1, 256, 5, 32] : tensor<1x256x160xf32> into tensor<1x256x5x32xf32> | |
%568 = tensor.empty() : tensor<1x5x256x32xf32> | |
// Swap dims 1 and 2 of the second projection: 1x256x5x32 -> 1x5x256x32.
%569 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_281 : tensor<1x256x5x32xf32>) outs(%568 : tensor<1x5x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x256x32xf32> | |
// Swap dims 1 and 2 of the first projection: 1x256x5x32 -> 1x5x256x32.
%570 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_280 : tensor<1x256x5x32xf32>) outs(%568 : tensor<1x5x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x256x32xf32> | |
%571 = tensor.empty() : tensor<1x5x32x256xf32> | |
// Transpose the last two dims of the first projection (#map10):
// 1x5x256x32 -> 1x5x32x256.
%572 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%570 : tensor<1x5x256x32xf32>) outs(%571 : tensor<1x5x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x32x256xf32> | |
// Batched product of %539 (1x5x1024x32) and %572 (1x5x32x256), divided by the
// scalar tensor %cst_200.
// NOTE(review): presumably scaled attention scores, with %cst_200 being the
// scaling divisor; its value is defined outside this chunk - confirm.
// Identity copies of both operands (the bodies only yield their input).
%573 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%539 : tensor<1x5x1024x32xf32>) outs(%538 : tensor<1x5x1024x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x1024x32xf32> | |
%574 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%572 : tensor<1x5x32x256xf32>) outs(%571 : tensor<1x5x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x32x256xf32> | |
// Merge the leading 1x5 dims into a single batch dim of 5 for batch_matmul.
%collapsed_282 = tensor.collapse_shape %573 [[0, 1], [2], [3]] : tensor<1x5x1024x32xf32> into tensor<5x1024x32xf32> | |
%collapsed_283 = tensor.collapse_shape %574 [[0, 1], [2], [3]] : tensor<1x5x32x256xf32> into tensor<5x32x256xf32> | |
%575 = tensor.empty() : tensor<5x1024x256xf32> | |
// Zero-filled accumulator (%cst_0 is 0.0).
%576 = linalg.fill ins(%cst_0 : f32) outs(%575 : tensor<5x1024x256xf32>) -> tensor<5x1024x256xf32> | |
// 5x1024x32 @ 5x32x256 -> 5x1024x256.
%577 = linalg.batch_matmul ins(%collapsed_282, %collapsed_283 : tensor<5x1024x32xf32>, tensor<5x32x256xf32>) outs(%576 : tensor<5x1024x256xf32>) -> tensor<5x1024x256xf32> | |
// Restore the leading 1x5 dims.
%expanded_284 = tensor.expand_shape %577 [[0, 1], [2], [3]] output_shape [1, 5, 1024, 256] : tensor<5x1024x256xf32> into tensor<1x5x1024x256xf32> | |
%578 = tensor.empty() : tensor<1x5x1024x256xf32> | |
// Divide every element by the scalar tensor %cst_200.
%579 = linalg.generic {indexing_maps = [#map18, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_284, %cst_200 : tensor<1x5x1024x256xf32>, tensor<f32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
// Softmax of %579 over its last (256-wide) dim, computed as
// exp(x - max(x)) / sum(exp(x - max(x))).
%580 = tensor.empty() : tensor<1x5x1024xi64> | |
// Index accumulator initialized with %c0_i64.
%581 = linalg.fill ins(%c0_i64 : i64) outs(%580 : tensor<1x5x1024xi64>) -> tensor<1x5x1024xi64> | |
%582 = tensor.empty() : tensor<1x5x1024xf32> | |
// Max accumulator initialized with %cst_1.
// NOTE(review): %cst_1 is defined outside this chunk; presumably -inf or the
// lowest finite f32 - confirm.
%583 = linalg.fill ins(%cst_1 : f32) outs(%582 : tensor<1x5x1024xf32>) -> tensor<1x5x1024xf32> | |
// Reduce over dim 3, yielding the running maximum (#0) and the dim-3 index at
// which a strictly greater value was last seen (#1). Only #0 is used within
// this chunk.
%584:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%579 : tensor<1x5x1024x256xf32>) outs(%583, %581 : tensor<1x5x1024xf32>, tensor<1x5x1024xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x5x1024xf32>, tensor<1x5x1024xi64>) | |
// Append a trailing unit dim to the max so it can be broadcast.
%expanded_285 = tensor.expand_shape %584#0 [[0], [1], [2, 3]] output_shape [1, 5, 1024, 1] : tensor<1x5x1024xf32> into tensor<1x5x1024x1xf32> | |
// Subtract the per-row max (broadcast along the last dim via #map19).
%585 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%579, %expanded_285 : tensor<1x5x1024x256xf32>, tensor<1x5x1024x1xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
// Elementwise exp.
%586 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%585 : tensor<1x5x1024x256xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
%587 = tensor.empty() : tensor<1x5x1024x1xf32> | |
// Zero-filled accumulator (%cst_0 is 0.0) for the sum of exponentials.
%588 = linalg.fill ins(%cst_0 : f32) outs(%587 : tensor<1x5x1024x1xf32>) -> tensor<1x5x1024x1xf32> | |
// Sum the exponentials over dim 3.
%589 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%586 : tensor<1x5x1024x256xf32>) outs(%588 : tensor<1x5x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x1xf32> | |
// Divide each exponential by its row sum (broadcast along the last dim).
%590 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%586, %589 : tensor<1x5x1024x256xf32>, tensor<1x5x1024x1xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
// Identity copy of %590 (the body only yields its input).
%591 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%590 : tensor<1x5x1024x256xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x1024x256xf32> | |
%592 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%569 : tensor<1x5x256x32xf32>) outs(%568 : tensor<1x5x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x256x32xf32> | |
%collapsed_286 = tensor.collapse_shape %591 [[0, 1], [2], [3]] : tensor<1x5x1024x256xf32> into tensor<5x1024x256xf32> | |
%collapsed_287 = tensor.collapse_shape %592 [[0, 1], [2], [3]] : tensor<1x5x256x32xf32> into tensor<5x256x32xf32> | |
%593 = tensor.empty() : tensor<5x1024x32xf32> | |
%594 = linalg.fill ins(%cst_0 : f32) outs(%593 : tensor<5x1024x32xf32>) -> tensor<5x1024x32xf32> | |
%595 = linalg.batch_matmul ins(%collapsed_286, %collapsed_287 : tensor<5x1024x256xf32>, tensor<5x256x32xf32>) outs(%594 : tensor<5x1024x32xf32>) -> tensor<5x1024x32xf32> | |
%expanded_288 = tensor.expand_shape %595 [[0, 1], [2], [3]] output_shape [1, 5, 1024, 32] : tensor<5x1024x32xf32> into tensor<1x5x1024x32xf32> | |
%596 = tensor.empty() : tensor<1x1024x5x32xf32> | |
%597 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_288 : tensor<1x5x1024x32xf32>) outs(%596 : tensor<1x1024x5x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x5x32xf32> | |
%collapsed_289 = tensor.collapse_shape %597 [[0], [1], [2, 3]] : tensor<1x1024x5x32xf32> into tensor<1x1024x160xf32> | |
%598 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_289 : tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%599 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_26 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
%600 = linalg.batch_matmul ins(%598, %599 : tensor<1x1024x160xf32>, tensor<1x160x160xf32>) outs(%535 : tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32> | |
%601 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_110, %600 : tensor<160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%602 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%601, %518 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%603 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%602 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%604 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%603 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%605 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%604 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%606 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%602, %605 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%607 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%606, %606 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%608 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%607 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%609 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%608 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%610 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%609 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%611 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%610 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%612 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%611 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%613 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%606, %612 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%614 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%613, %cst_109 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%615 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%614, %cst_108 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%616 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%615 : tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%617 = tensor.empty() : tensor<1x160x640xf32> | |
%618 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_25 : tensor<160x640xf32>) outs(%617 : tensor<1x160x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x640xf32> | |
%619 = tensor.empty() : tensor<1x1024x640xf32> | |
%620 = linalg.fill ins(%cst_0 : f32) outs(%619 : tensor<1x1024x640xf32>) -> tensor<1x1024x640xf32> | |
%621 = linalg.batch_matmul ins(%616, %618 : tensor<1x1024x160xf32>, tensor<1x160x640xf32>) outs(%620 : tensor<1x1024x640xf32>) -> tensor<1x1024x640xf32> | |
%622 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_107, %621 : tensor<640xf32>, tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%623 = tensor.empty() : tensor<1x640x1024xf32> | |
%624 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%622 : tensor<1x1024x640xf32>) outs(%623 : tensor<1x640x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x640x1024xf32> | |
%expanded_290 = tensor.expand_shape %624 [[0], [1], [2, 3]] output_shape [1, 640, 32, 32] : tensor<1x640x1024xf32> into tensor<1x640x32x32xf32> | |
%padded_291 = tensor.pad %expanded_290 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x640x32x32xf32> to tensor<1x640x34x34xf32> | |
%625 = tensor.empty() : tensor<1x640x32x32xf32> | |
%626 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_105 : tensor<640xf32>) outs(%625 : tensor<1x640x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x640x32x32xf32> | |
%collapsed_292 = tensor.collapse_shape %cst_106 [[0, 1], [2], [3]] : tensor<640x1x3x3xf32> into tensor<640x3x3xf32> | |
%627 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_291, %collapsed_292 : tensor<1x640x34x34xf32>, tensor<640x3x3xf32>) outs(%626 : tensor<1x640x32x32xf32>) -> tensor<1x640x32x32xf32> | |
%collapsed_293 = tensor.collapse_shape %627 [[0], [1], [2, 3]] : tensor<1x640x32x32xf32> into tensor<1x640x1024xf32> | |
%628 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_293 : tensor<1x640x1024xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x640xf32> | |
%629 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%628, %cst_201 : tensor<1x1024x640xf32>, tensor<f32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%630 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%629 : tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%631 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%630, %cst_202 : tensor<1x1024x640xf32>, tensor<f32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%632 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%628, %631 : tensor<1x1024x640xf32>, tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%633 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%632, %cst_203 : tensor<1x1024x640xf32>, tensor<f32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%634 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%633 : tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x640xf32> | |
%635 = tensor.empty() : tensor<1x640x160xf32> | |
%636 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_24 : tensor<640x160xf32>) outs(%635 : tensor<1x640x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x640x160xf32> | |
%637 = linalg.batch_matmul ins(%634, %636 : tensor<1x1024x640xf32>, tensor<1x640x160xf32>) outs(%535 : tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32> | |
%638 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_104, %637 : tensor<160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%639 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%638, %602 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%640 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%639 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%641 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%640 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%642 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%641 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%643 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%639, %642 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%644 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%643, %643 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%645 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%644 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%646 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%645 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%647 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%646 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%648 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%647 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%649 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%648 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%650 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%643, %649 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%651 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%650, %cst_103 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%652 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%651, %cst_102 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%653 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%652 : tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%654 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_23 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
%655 = linalg.batch_matmul ins(%653, %654 : tensor<1x1024x160xf32>, tensor<1x160x160xf32>) outs(%535 : tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32> | |
%656 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_101, %655 : tensor<160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%expanded_294 = tensor.expand_shape %656 [[0], [1], [2, 3]] output_shape [1, 1024, 5, 32] : tensor<1x1024x160xf32> into tensor<1x1024x5x32xf32> | |
%657 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_294 : tensor<1x1024x5x32xf32>) outs(%538 : tensor<1x5x1024x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x1024x32xf32> | |
%658 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%652 : tensor<1x1024x160xf32>) outs(%540 : tensor<1x160x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x1024xf32> | |
%expanded_295 = tensor.expand_shape %658 [[0], [1], [2, 3]] output_shape [1, 160, 32, 32] : tensor<1x160x1024xf32> into tensor<1x160x32x32xf32> | |
%659 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_97 : tensor<160xf32>) outs(%542 : tensor<1x160x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x16x16xf32> | |
%660 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%expanded_295, %cst_98 : tensor<1x160x32x32xf32>, tensor<160x160x2x2xf32>) outs(%659 : tensor<1x160x16x16xf32>) -> tensor<1x160x16x16xf32> | |
%collapsed_296 = tensor.collapse_shape %660 [[0], [1], [2, 3]] : tensor<1x160x16x16xf32> into tensor<1x160x256xf32> | |
%661 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_296 : tensor<1x160x256xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
%662 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%661 : tensor<1x256x160xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%663 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%662 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%664 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%663 : tensor<1x256x1xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
%665 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%661, %664 : tensor<1x256x160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
%666 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%665, %665 : tensor<1x256x160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
%667 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%666 : tensor<1x256x160xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%668 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%667 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%669 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%668 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%670 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%669 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%671 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%670 : tensor<1x256x1xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
%672 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%665, %671 : tensor<1x256x160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
%673 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%672, %cst_96 : tensor<1x256x160xf32>, tensor<160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
%674 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%673, %cst_95 : tensor<1x256x160xf32>, tensor<160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
%675 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%674 : tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x160xf32> | |
%676 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_22 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
%677 = linalg.batch_matmul ins(%675, %676 : tensor<1x256x160xf32>, tensor<1x160x160xf32>) outs(%562 : tensor<1x256x160xf32>) -> tensor<1x256x160xf32> | |
%678 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_100, %677 : tensor<160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
%expanded_297 = tensor.expand_shape %678 [[0], [1], [2, 3]] output_shape [1, 256, 5, 32] : tensor<1x256x160xf32> into tensor<1x256x5x32xf32> | |
%679 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_21 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
%680 = linalg.batch_matmul ins(%675, %679 : tensor<1x256x160xf32>, tensor<1x160x160xf32>) outs(%562 : tensor<1x256x160xf32>) -> tensor<1x256x160xf32> | |
%681 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_99, %680 : tensor<160xf32>, tensor<1x256x160xf32>) outs(%545 : tensor<1x256x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x160xf32> | |
%expanded_298 = tensor.expand_shape %681 [[0], [1], [2, 3]] output_shape [1, 256, 5, 32] : tensor<1x256x160xf32> into tensor<1x256x5x32xf32> | |
%682 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_298 : tensor<1x256x5x32xf32>) outs(%568 : tensor<1x5x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x256x32xf32> | |
%683 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_297 : tensor<1x256x5x32xf32>) outs(%568 : tensor<1x5x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x256x32xf32> | |
%684 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%683 : tensor<1x5x256x32xf32>) outs(%571 : tensor<1x5x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x32x256xf32> | |
%685 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%657 : tensor<1x5x1024x32xf32>) outs(%538 : tensor<1x5x1024x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x1024x32xf32> | |
%686 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%684 : tensor<1x5x32x256xf32>) outs(%571 : tensor<1x5x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x32x256xf32> | |
%collapsed_299 = tensor.collapse_shape %685 [[0, 1], [2], [3]] : tensor<1x5x1024x32xf32> into tensor<5x1024x32xf32> | |
%collapsed_300 = tensor.collapse_shape %686 [[0, 1], [2], [3]] : tensor<1x5x32x256xf32> into tensor<5x32x256xf32> | |
%687 = linalg.batch_matmul ins(%collapsed_299, %collapsed_300 : tensor<5x1024x32xf32>, tensor<5x32x256xf32>) outs(%576 : tensor<5x1024x256xf32>) -> tensor<5x1024x256xf32> | |
%expanded_301 = tensor.expand_shape %687 [[0, 1], [2], [3]] output_shape [1, 5, 1024, 256] : tensor<5x1024x256xf32> into tensor<1x5x1024x256xf32> | |
%688 = linalg.generic {indexing_maps = [#map18, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_301, %cst_200 : tensor<1x5x1024x256xf32>, tensor<f32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
%689:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%688 : tensor<1x5x1024x256xf32>) outs(%583, %581 : tensor<1x5x1024xf32>, tensor<1x5x1024xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x5x1024xf32>, tensor<1x5x1024xi64>) | |
%expanded_302 = tensor.expand_shape %689#0 [[0], [1], [2, 3]] output_shape [1, 5, 1024, 1] : tensor<1x5x1024xf32> into tensor<1x5x1024x1xf32> | |
%690 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%688, %expanded_302 : tensor<1x5x1024x256xf32>, tensor<1x5x1024x1xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
%691 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%690 : tensor<1x5x1024x256xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
%692 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%691 : tensor<1x5x1024x256xf32>) outs(%588 : tensor<1x5x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x1xf32> | |
%693 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%691, %692 : tensor<1x5x1024x256xf32>, tensor<1x5x1024x1xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x5x1024x256xf32> | |
%694 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%693 : tensor<1x5x1024x256xf32>) outs(%578 : tensor<1x5x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x1024x256xf32> | |
%695 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%682 : tensor<1x5x256x32xf32>) outs(%568 : tensor<1x5x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x5x256x32xf32> | |
%collapsed_303 = tensor.collapse_shape %694 [[0, 1], [2], [3]] : tensor<1x5x1024x256xf32> into tensor<5x1024x256xf32> | |
%collapsed_304 = tensor.collapse_shape %695 [[0, 1], [2], [3]] : tensor<1x5x256x32xf32> into tensor<5x256x32xf32> | |
%696 = linalg.batch_matmul ins(%collapsed_303, %collapsed_304 : tensor<5x1024x256xf32>, tensor<5x256x32xf32>) outs(%594 : tensor<5x1024x32xf32>) -> tensor<5x1024x32xf32> | |
%expanded_305 = tensor.expand_shape %696 [[0, 1], [2], [3]] output_shape [1, 5, 1024, 32] : tensor<5x1024x32xf32> into tensor<1x5x1024x32xf32> | |
%697 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_305 : tensor<1x5x1024x32xf32>) outs(%596 : tensor<1x1024x5x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x5x32xf32> | |
%collapsed_306 = tensor.collapse_shape %697 [[0], [1], [2, 3]] : tensor<1x1024x5x32xf32> into tensor<1x1024x160xf32> | |
%698 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_306 : tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%699 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_20 : tensor<160x160xf32>) outs(%533 : tensor<1x160x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x160xf32> | |
%700 = linalg.batch_matmul ins(%698, %699 : tensor<1x1024x160xf32>, tensor<1x160x160xf32>) outs(%535 : tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32> | |
%701 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_94, %700 : tensor<160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%702 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%701, %639 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%703 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%702 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%704 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%703 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%705 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%704 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%706 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%702, %705 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%707 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%706, %706 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%708 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%707 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%709 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%708 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%710 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%709 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%711 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%710 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%712 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%711 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%713 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%706, %712 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%714 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%713, %cst_93 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%715 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%714, %cst_92 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%716 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%715 : tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%717 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_19 : tensor<160x640xf32>) outs(%617 : tensor<1x160x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x640xf32> | |
%718 = linalg.batch_matmul ins(%716, %717 : tensor<1x1024x160xf32>, tensor<1x160x640xf32>) outs(%620 : tensor<1x1024x640xf32>) -> tensor<1x1024x640xf32> | |
%719 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_91, %718 : tensor<640xf32>, tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%720 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%719 : tensor<1x1024x640xf32>) outs(%623 : tensor<1x640x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x640x1024xf32> | |
%expanded_307 = tensor.expand_shape %720 [[0], [1], [2, 3]] output_shape [1, 640, 32, 32] : tensor<1x640x1024xf32> into tensor<1x640x32x32xf32> | |
%padded_308 = tensor.pad %expanded_307 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x640x32x32xf32> to tensor<1x640x34x34xf32> | |
%721 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_89 : tensor<640xf32>) outs(%625 : tensor<1x640x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x640x32x32xf32> | |
%collapsed_309 = tensor.collapse_shape %cst_90 [[0, 1], [2], [3]] : tensor<640x1x3x3xf32> into tensor<640x3x3xf32> | |
%722 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_308, %collapsed_309 : tensor<1x640x34x34xf32>, tensor<640x3x3xf32>) outs(%721 : tensor<1x640x32x32xf32>) -> tensor<1x640x32x32xf32> | |
%collapsed_310 = tensor.collapse_shape %722 [[0], [1], [2, 3]] : tensor<1x640x32x32xf32> into tensor<1x640x1024xf32> | |
%723 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_310 : tensor<1x640x1024xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x640xf32> | |
%724 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%723, %cst_201 : tensor<1x1024x640xf32>, tensor<f32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%725 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%724 : tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%726 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%725, %cst_202 : tensor<1x1024x640xf32>, tensor<f32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%727 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%723, %726 : tensor<1x1024x640xf32>, tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%728 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%727, %cst_203 : tensor<1x1024x640xf32>, tensor<f32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x640xf32> | |
%729 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%728 : tensor<1x1024x640xf32>) outs(%619 : tensor<1x1024x640xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x640xf32> | |
%730 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_18 : tensor<640x160xf32>) outs(%635 : tensor<1x640x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x640x160xf32> | |
%731 = linalg.batch_matmul ins(%729, %730 : tensor<1x1024x640xf32>, tensor<1x640x160xf32>) outs(%535 : tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32> | |
%732 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_88, %731 : tensor<160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%733 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%732, %702 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%734 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%733 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%735 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%734 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%736 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%735 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%737 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%733, %736 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%738 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%737, %737 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%739 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%738 : tensor<1x1024x160xf32>) outs(%505 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%740 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%739 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_5 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%741 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%740 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%742 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%741 : tensor<1x1024x1xf32>) outs(%504 : tensor<1x1024x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x1xf32> | |
%743 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%742 : tensor<1x1024x1xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x160xf32> | |
%744 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%737, %743 : tensor<1x1024x160xf32>, tensor<1x1024x160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%745 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%744, %cst_59 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%746 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%745, %cst_58 : tensor<1x1024x160xf32>, tensor<160xf32>) outs(%502 : tensor<1x1024x160xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1024x160xf32> | |
%expanded_311 = tensor.expand_shape %746 [[0], [1, 2], [3]] output_shape [1, 32, 32, 160] : tensor<1x1024x160xf32> into tensor<1x32x32x160xf32> | |
%747 = linalg.generic {indexing_maps = [#map1, #map17], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_311 : tensor<1x32x32x160xf32>) outs(%499 : tensor<1x160x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x32x32xf32> | |
%748 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%747 : tensor<1x160x32x32xf32>) outs(%499 : tensor<1x160x32x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x160x32x32xf32> | |
%padded_312 = tensor.pad %748 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x160x32x32xf32> to tensor<1x160x34x34xf32> | |
%749 = tensor.empty() : tensor<1x256x16x16xf32> | |
%750 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_186 : tensor<256xf32>) outs(%749 : tensor<1x256x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x16x16xf32> | |
%751 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded_312, %cst_187 : tensor<1x160x34x34xf32>, tensor<256x160x3x3xf32>) outs(%750 : tensor<1x256x16x16xf32>) -> tensor<1x256x16x16xf32> | |
%collapsed_313 = tensor.collapse_shape %751 [[0], [1], [2, 3]] : tensor<1x256x16x16xf32> into tensor<1x256x256xf32> | |
%752 = tensor.empty() : tensor<1x256x256xf32> | |
%753 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_313 : tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%754 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%753 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%755 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%754 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%756 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%755 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%757 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%753, %756 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%758 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%757, %757 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%759 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%758 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%760 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%759 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%761 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%760 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%762 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%761 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%763 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%762 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%764 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%757, %763 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%765 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%764, %cst_185 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%766 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%765, %cst_184 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%767 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%766 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%768 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%767 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%769 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%768 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%770 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%766, %769 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%771 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%770, %770 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%772 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%771 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%773 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%772 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%774 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%773 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%775 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%774 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%776 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%775 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%777 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%770, %776 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%778 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%777, %cst_87 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%779 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%778, %cst_86 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%780 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%779 : tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%781 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_17 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%782 = linalg.fill ins(%cst_0 : f32) outs(%752 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%783 = linalg.batch_matmul ins(%780, %781 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%784 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_85, %783 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%expanded_314 = tensor.expand_shape %784 [[0], [1], [2, 3]] output_shape [1, 256, 8, 32] : tensor<1x256x256xf32> into tensor<1x256x8x32xf32> | |
%785 = tensor.empty() : tensor<1x8x256x32xf32> | |
%786 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_314 : tensor<1x256x8x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%787 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_16 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%788 = linalg.batch_matmul ins(%780, %787 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%789 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_84, %788 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%expanded_315 = tensor.expand_shape %789 [[0], [1], [2, 3]] output_shape [1, 256, 8, 32] : tensor<1x256x256xf32> into tensor<1x256x8x32xf32> | |
%790 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_15 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%791 = linalg.batch_matmul ins(%780, %790 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%792 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_83, %791 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%expanded_316 = tensor.expand_shape %792 [[0], [1], [2, 3]] output_shape [1, 256, 8, 32] : tensor<1x256x256xf32> into tensor<1x256x8x32xf32> | |
%793 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_316 : tensor<1x256x8x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%794 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_315 : tensor<1x256x8x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%795 = tensor.empty() : tensor<1x8x32x256xf32> | |
%796 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%794 : tensor<1x8x256x32xf32>) outs(%795 : tensor<1x8x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x32x256xf32> | |
%797 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%786 : tensor<1x8x256x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%798 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%796 : tensor<1x8x32x256xf32>) outs(%795 : tensor<1x8x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x32x256xf32> | |
%collapsed_317 = tensor.collapse_shape %797 [[0, 1], [2], [3]] : tensor<1x8x256x32xf32> into tensor<8x256x32xf32> | |
%collapsed_318 = tensor.collapse_shape %798 [[0, 1], [2], [3]] : tensor<1x8x32x256xf32> into tensor<8x32x256xf32> | |
%799 = tensor.empty() : tensor<8x256x256xf32> | |
%800 = linalg.fill ins(%cst_0 : f32) outs(%799 : tensor<8x256x256xf32>) -> tensor<8x256x256xf32> | |
%801 = linalg.batch_matmul ins(%collapsed_317, %collapsed_318 : tensor<8x256x32xf32>, tensor<8x32x256xf32>) outs(%800 : tensor<8x256x256xf32>) -> tensor<8x256x256xf32> | |
%expanded_319 = tensor.expand_shape %801 [[0, 1], [2], [3]] output_shape [1, 8, 256, 256] : tensor<8x256x256xf32> into tensor<1x8x256x256xf32> | |
%802 = tensor.empty() : tensor<1x8x256x256xf32> | |
%803 = linalg.generic {indexing_maps = [#map18, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_319, %cst_200 : tensor<1x8x256x256xf32>, tensor<f32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%804 = tensor.empty() : tensor<1x8x256xi64> | |
%805 = linalg.fill ins(%c0_i64 : i64) outs(%804 : tensor<1x8x256xi64>) -> tensor<1x8x256xi64> | |
%806 = tensor.empty() : tensor<1x8x256xf32> | |
%807 = linalg.fill ins(%cst_1 : f32) outs(%806 : tensor<1x8x256xf32>) -> tensor<1x8x256xf32> | |
%808:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%803 : tensor<1x8x256x256xf32>) outs(%807, %805 : tensor<1x8x256xf32>, tensor<1x8x256xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x8x256xf32>, tensor<1x8x256xi64>) | |
%expanded_320 = tensor.expand_shape %808#0 [[0], [1], [2, 3]] output_shape [1, 8, 256, 1] : tensor<1x8x256xf32> into tensor<1x8x256x1xf32> | |
%809 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%803, %expanded_320 : tensor<1x8x256x256xf32>, tensor<1x8x256x1xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%810 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%809 : tensor<1x8x256x256xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%811 = tensor.empty() : tensor<1x8x256x1xf32> | |
%812 = linalg.fill ins(%cst_0 : f32) outs(%811 : tensor<1x8x256x1xf32>) -> tensor<1x8x256x1xf32> | |
%813 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%810 : tensor<1x8x256x256xf32>) outs(%812 : tensor<1x8x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x1xf32> | |
%814 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%810, %813 : tensor<1x8x256x256xf32>, tensor<1x8x256x1xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%815 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%814 : tensor<1x8x256x256xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x256xf32> | |
%816 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%793 : tensor<1x8x256x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%collapsed_321 = tensor.collapse_shape %815 [[0, 1], [2], [3]] : tensor<1x8x256x256xf32> into tensor<8x256x256xf32> | |
%collapsed_322 = tensor.collapse_shape %816 [[0, 1], [2], [3]] : tensor<1x8x256x32xf32> into tensor<8x256x32xf32> | |
%817 = tensor.empty() : tensor<8x256x32xf32> | |
%818 = linalg.fill ins(%cst_0 : f32) outs(%817 : tensor<8x256x32xf32>) -> tensor<8x256x32xf32> | |
%819 = linalg.batch_matmul ins(%collapsed_321, %collapsed_322 : tensor<8x256x256xf32>, tensor<8x256x32xf32>) outs(%818 : tensor<8x256x32xf32>) -> tensor<8x256x32xf32> | |
%expanded_323 = tensor.expand_shape %819 [[0, 1], [2], [3]] output_shape [1, 8, 256, 32] : tensor<8x256x32xf32> into tensor<1x8x256x32xf32> | |
%820 = tensor.empty() : tensor<1x256x8x32xf32> | |
%821 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_323 : tensor<1x8x256x32xf32>) outs(%820 : tensor<1x256x8x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x8x32xf32> | |
%collapsed_324 = tensor.collapse_shape %821 [[0], [1], [2, 3]] : tensor<1x256x8x32xf32> into tensor<1x256x256xf32> | |
%822 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_324 : tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%823 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_14 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%824 = linalg.batch_matmul ins(%822, %823 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%825 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_82, %824 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%826 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%825, %766 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%827 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%826 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%828 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%827 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%829 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%828 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%830 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%826, %829 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%831 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%830, %830 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%832 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%831 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%833 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%832 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%834 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%833 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%835 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%834 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%836 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%835 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%837 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%830, %836 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%838 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%837, %cst_81 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%839 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%838, %cst_80 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%840 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%839 : tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%841 = tensor.empty() : tensor<1x256x1024xf32> | |
%842 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_13 : tensor<256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x1024xf32> | |
%843 = linalg.fill ins(%cst_0 : f32) outs(%841 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32> | |
%844 = linalg.batch_matmul ins(%840, %842 : tensor<1x256x256xf32>, tensor<1x256x1024xf32>) outs(%843 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32> | |
%845 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_79, %844 : tensor<1024xf32>, tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%846 = tensor.empty() : tensor<1x1024x256xf32> | |
%847 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%845 : tensor<1x256x1024xf32>) outs(%846 : tensor<1x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x256xf32> | |
%expanded_325 = tensor.expand_shape %847 [[0], [1], [2, 3]] output_shape [1, 1024, 16, 16] : tensor<1x1024x256xf32> into tensor<1x1024x16x16xf32> | |
%padded_326 = tensor.pad %expanded_325 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x1024x16x16xf32> to tensor<1x1024x18x18xf32> | |
%848 = tensor.empty() : tensor<1x1024x16x16xf32> | |
%849 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_77 : tensor<1024xf32>) outs(%848 : tensor<1x1024x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x16x16xf32> | |
%collapsed_327 = tensor.collapse_shape %cst_78 [[0, 1], [2], [3]] : tensor<1024x1x3x3xf32> into tensor<1024x3x3xf32> | |
%850 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_326, %collapsed_327 : tensor<1x1024x18x18xf32>, tensor<1024x3x3xf32>) outs(%849 : tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32> | |
%collapsed_328 = tensor.collapse_shape %850 [[0], [1], [2, 3]] : tensor<1x1024x16x16xf32> into tensor<1x1024x256xf32> | |
%851 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_328 : tensor<1x1024x256xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x1024xf32> | |
%852 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%851, %cst_201 : tensor<1x256x1024xf32>, tensor<f32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%853 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%852 : tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%854 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%853, %cst_202 : tensor<1x256x1024xf32>, tensor<f32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%855 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%851, %854 : tensor<1x256x1024xf32>, tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%856 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%855, %cst_203 : tensor<1x256x1024xf32>, tensor<f32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%857 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%856 : tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x1024xf32> | |
%858 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_12 : tensor<1024x256xf32>) outs(%846 : tensor<1x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x256xf32> | |
%859 = linalg.batch_matmul ins(%857, %858 : tensor<1x256x1024xf32>, tensor<1x1024x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%860 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_76, %859 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%861 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%860, %826 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%862 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%861 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%863 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%862 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%864 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%863 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%865 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%861, %864 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%866 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%865, %865 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%867 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%866 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%868 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%867 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%869 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%868 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%870 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%869 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%871 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%870 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%872 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%865, %871 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%873 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%872, %cst_75 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%874 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%873, %cst_74 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%875 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%874 : tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%876 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_11 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%877 = linalg.batch_matmul ins(%875, %876 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%878 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_73, %877 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%expanded_329 = tensor.expand_shape %878 [[0], [1], [2, 3]] output_shape [1, 256, 8, 32] : tensor<1x256x256xf32> into tensor<1x256x8x32xf32> | |
%879 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_329 : tensor<1x256x8x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%880 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_10 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%881 = linalg.batch_matmul ins(%875, %880 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%882 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_72, %881 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%expanded_330 = tensor.expand_shape %882 [[0], [1], [2, 3]] output_shape [1, 256, 8, 32] : tensor<1x256x256xf32> into tensor<1x256x8x32xf32> | |
%883 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_9 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%884 = linalg.batch_matmul ins(%875, %883 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%885 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_71, %884 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%expanded_331 = tensor.expand_shape %885 [[0], [1], [2, 3]] output_shape [1, 256, 8, 32] : tensor<1x256x256xf32> into tensor<1x256x8x32xf32> | |
%886 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_331 : tensor<1x256x8x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%887 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_330 : tensor<1x256x8x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%888 = linalg.generic {indexing_maps = [#map1, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%887 : tensor<1x8x256x32xf32>) outs(%795 : tensor<1x8x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x32x256xf32> | |
%889 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%879 : tensor<1x8x256x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%890 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%888 : tensor<1x8x32x256xf32>) outs(%795 : tensor<1x8x32x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x32x256xf32> | |
%collapsed_332 = tensor.collapse_shape %889 [[0, 1], [2], [3]] : tensor<1x8x256x32xf32> into tensor<8x256x32xf32> | |
%collapsed_333 = tensor.collapse_shape %890 [[0, 1], [2], [3]] : tensor<1x8x32x256xf32> into tensor<8x32x256xf32> | |
%891 = linalg.batch_matmul ins(%collapsed_332, %collapsed_333 : tensor<8x256x32xf32>, tensor<8x32x256xf32>) outs(%800 : tensor<8x256x256xf32>) -> tensor<8x256x256xf32> | |
%expanded_334 = tensor.expand_shape %891 [[0, 1], [2], [3]] output_shape [1, 8, 256, 256] : tensor<8x256x256xf32> into tensor<1x8x256x256xf32> | |
%892 = linalg.generic {indexing_maps = [#map18, #map12, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_334, %cst_200 : tensor<1x8x256x256xf32>, tensor<f32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%893:2 = linalg.generic {indexing_maps = [#map1, #map13, #map13], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%892 : tensor<1x8x256x256xf32>) outs(%807, %805 : tensor<1x8x256xf32>, tensor<1x8x256xi64>) { | |
^bb0(%in: f32, %out: f32, %out_344: i64): | |
%961 = linalg.index 3 : index | |
%962 = arith.index_cast %961 : index to i64 | |
%963 = arith.maximumf %in, %out : f32 | |
%964 = arith.cmpf ogt, %in, %out : f32 | |
%965 = arith.select %964, %962, %out_344 : i64 | |
linalg.yield %963, %965 : f32, i64 | |
} -> (tensor<1x8x256xf32>, tensor<1x8x256xi64>) | |
%expanded_335 = tensor.expand_shape %893#0 [[0], [1], [2, 3]] output_shape [1, 8, 256, 1] : tensor<1x8x256xf32> into tensor<1x8x256x1xf32> | |
%894 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%892, %expanded_335 : tensor<1x8x256x256xf32>, tensor<1x8x256x1xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%895 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%894 : tensor<1x8x256x256xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.exp %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%896 = linalg.generic {indexing_maps = [#map1, #map15], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%895 : tensor<1x8x256x256xf32>) outs(%812 : tensor<1x8x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x1xf32> | |
%897 = linalg.generic {indexing_maps = [#map18, #map19, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%895, %896 : tensor<1x8x256x256xf32>, tensor<1x8x256x1xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x8x256x256xf32> | |
%898 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%897 : tensor<1x8x256x256xf32>) outs(%802 : tensor<1x8x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x256xf32> | |
%899 = linalg.generic {indexing_maps = [#map18, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%886 : tensor<1x8x256x32xf32>) outs(%785 : tensor<1x8x256x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x8x256x32xf32> | |
%collapsed_336 = tensor.collapse_shape %898 [[0, 1], [2], [3]] : tensor<1x8x256x256xf32> into tensor<8x256x256xf32> | |
%collapsed_337 = tensor.collapse_shape %899 [[0, 1], [2], [3]] : tensor<1x8x256x32xf32> into tensor<8x256x32xf32> | |
%900 = linalg.batch_matmul ins(%collapsed_336, %collapsed_337 : tensor<8x256x256xf32>, tensor<8x256x32xf32>) outs(%818 : tensor<8x256x32xf32>) -> tensor<8x256x32xf32> | |
%expanded_338 = tensor.expand_shape %900 [[0, 1], [2], [3]] output_shape [1, 8, 256, 32] : tensor<8x256x32xf32> into tensor<1x8x256x32xf32> | |
%901 = linalg.generic {indexing_maps = [#map1, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_338 : tensor<1x8x256x32xf32>) outs(%820 : tensor<1x256x8x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x8x32xf32> | |
%collapsed_339 = tensor.collapse_shape %901 [[0], [1], [2, 3]] : tensor<1x256x8x32xf32> into tensor<1x256x256xf32> | |
%902 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_339 : tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%903 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_8 : tensor<256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%904 = linalg.batch_matmul ins(%902, %903 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%905 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_70, %904 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%906 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%905, %861 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%907 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%906 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%908 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%907 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%909 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%908 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%910 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%906, %909 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%911 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%910, %910 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%912 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%911 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%913 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%912 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%914 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%913 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%915 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%914 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%916 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%915 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%917 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%910, %916 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%918 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%917, %cst_69 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%919 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%918, %cst_68 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%920 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%919 : tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%921 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_7 : tensor<256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x1024xf32> | |
%922 = linalg.batch_matmul ins(%920, %921 : tensor<1x256x256xf32>, tensor<1x256x1024xf32>) outs(%843 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32> | |
%923 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_67, %922 : tensor<1024xf32>, tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%924 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%923 : tensor<1x256x1024xf32>) outs(%846 : tensor<1x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x256xf32> | |
%expanded_340 = tensor.expand_shape %924 [[0], [1], [2, 3]] output_shape [1, 1024, 16, 16] : tensor<1x1024x256xf32> into tensor<1x1024x16x16xf32> | |
%padded_341 = tensor.pad %expanded_340 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
tensor.yield %cst_0 : f32 | |
} : tensor<1x1024x16x16xf32> to tensor<1x1024x18x18xf32> | |
%925 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_65 : tensor<1024xf32>) outs(%848 : tensor<1x1024x16x16xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x16x16xf32> | |
%collapsed_342 = tensor.collapse_shape %cst_66 [[0, 1], [2], [3]] : tensor<1024x1x3x3xf32> into tensor<1024x3x3xf32> | |
%926 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_341, %collapsed_342 : tensor<1x1024x18x18xf32>, tensor<1024x3x3xf32>) outs(%925 : tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32> | |
%collapsed_343 = tensor.collapse_shape %926 [[0], [1], [2, 3]] : tensor<1x1024x16x16xf32> into tensor<1x1024x256xf32> | |
%927 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_343 : tensor<1x1024x256xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x1024xf32> | |
%928 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%927, %cst_201 : tensor<1x256x1024xf32>, tensor<f32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.divf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%929 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%928 : tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.erf %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%930 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%929, %cst_202 : tensor<1x256x1024xf32>, tensor<f32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%931 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%927, %930 : tensor<1x256x1024xf32>, tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%932 = linalg.generic {indexing_maps = [#map6, #map16, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%931, %cst_203 : tensor<1x256x1024xf32>, tensor<f32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1024xf32> | |
%933 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%932 : tensor<1x256x1024xf32>) outs(%841 : tensor<1x256x1024xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x1024xf32> | |
%934 = linalg.generic {indexing_maps = [#map8, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst : tensor<1024x256xf32>) outs(%846 : tensor<1x1024x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x1024x256xf32> | |
%935 = linalg.batch_matmul ins(%933, %934 : tensor<1x256x1024xf32>, tensor<1x1024x256xf32>) outs(%782 : tensor<1x256x256xf32>) -> tensor<1x256x256xf32> | |
%936 = linalg.generic {indexing_maps = [#map7, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_64, %935 : tensor<256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%937 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%936, %906 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%938 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%937 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%939 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%938 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%940 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%939 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%941 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%937, %940 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.subf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%942 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%941, %941 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%943 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%942 : tensor<1x256x256xf32>) outs(%49 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%944 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%943 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%945 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%944 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %cst_3 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%946 = linalg.generic {indexing_maps = [#map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%945 : tensor<1x256x1xf32>) outs(%48 : tensor<1x256x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = math.rsqrt %in : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x1xf32> | |
%947 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%946 : tensor<1x256x1xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<1x256x256xf32> | |
%948 = linalg.generic {indexing_maps = [#map6, #map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%941, %947 : tensor<1x256x256xf32>, tensor<1x256x256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%949 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%948, %cst_57 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.mulf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%950 = linalg.generic {indexing_maps = [#map6, #map7, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%949, %cst_56 : tensor<1x256x256xf32>, tensor<256xf32>) outs(%752 : tensor<1x256x256xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256x256xf32> | |
%951 = tensor.empty() : tensor<1x256xf32> | |
%952 = linalg.fill ins(%cst_0 : f32) outs(%951 : tensor<1x256xf32>) -> tensor<1x256xf32> | |
%953 = linalg.generic {indexing_maps = [#map2, #map20], iterator_types = ["parallel", "reduction", "parallel"]} ins(%950 : tensor<1x256x256xf32>) outs(%952 : tensor<1x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.addf %in, %out : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256xf32> | |
%954 = linalg.generic {indexing_maps = [#map21, #map22], iterator_types = ["parallel", "parallel"]} ins(%953 : tensor<1x256xf32>) outs(%951 : tensor<1x256xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%961 = arith.divf %in, %cst_6 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x256xf32> | |
%955 = tensor.empty() : tensor<256x1000xf32> | |
%956 = linalg.generic {indexing_maps = [#map22, #map23], iterator_types = ["parallel", "parallel"]} ins(%cst_55 : tensor<1000x256xf32>) outs(%955 : tensor<256x1000xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<256x1000xf32> | |
%957 = tensor.empty() : tensor<1x1000xf32> | |
%958 = linalg.fill ins(%cst_0 : f32) outs(%957 : tensor<1x1000xf32>) -> tensor<1x1000xf32> | |
%959 = linalg.matmul ins(%954, %956 : tensor<1x256xf32>, tensor<256x1000xf32>) outs(%958 : tensor<1x1000xf32>) -> tensor<1x1000xf32> | |
%960 = linalg.generic {indexing_maps = [#map21, #map24, #map22], iterator_types = ["parallel", "parallel"]} ins(%959, %cst_54 : tensor<1x1000xf32>, tensor<1000xf32>) outs(%957 : tensor<1x1000xf32>) { | |
^bb0(%in: f32, %in_344: f32, %out: f32): | |
%961 = arith.addf %in, %in_344 : f32 | |
linalg.yield %961 : f32 | |
} -> tensor<1x1000xf32> | |
return %960 : tensor<1x1000xf32> | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment