// *** IR Dump After CSE ***
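// NOTE: The original dump is truncated. The shapes and structure below
// (1x224x224x3 input, inverted-residual blocks with ReLU6 activations,
// 1x1000 logits output) suggest a MobileNetV2-style image classifier
// lowered to linalg on tensors.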
func @call(%arg0: tensor<1x224x224x3xf32> {tf._user_specified_name = "x"}) -> tensor<1x1000xf32> attributes {iree.module.export, iree.reflection = {abi = "sip", abiv = 1 : i32, f = "I17!B13!d1d224d224d3R11!B8!d1d1000", fv = "1", sip = "I8!S5!k0_0R3!_0"}, tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf.shape<1x224x224x3>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, 
#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} {
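// The opaque<"_", "0xDEADBEEF"> constants below are weight/bias tensors whose
// contents were elided by the IR printer; 0xDEADBEEF is a placeholder byte
// pattern, not real data.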
%cst = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_0 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_1 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_2 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_3 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_4 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_5 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_6 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_7 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_8 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_9 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x96xf32>
%cst_10 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_11 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_12 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_13 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_14 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_15 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_16 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_17 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_18 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_19 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_20 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x96xf32>
%cst_21 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_22 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_23 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_24 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_25 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_26 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_27 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_28 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_29 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_30 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_31 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x96xf32>
%cst_32 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_33 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_34 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_35 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_36 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_37 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_38 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_39 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_40 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_41 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_42 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x160xf32>
%cst_43 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_44 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_45 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_46 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_47 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_48 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_49 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_50 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_51 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_52 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_53 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x160xf32>
%cst_54 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_55 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_56 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_57 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_58 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_59 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_60 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_61 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_62 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_63 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_64 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x160xf32>
%cst_65 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_66 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_67 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_68 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_69 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_70 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_71 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_72 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_73 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_74 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_75 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x320xf32>
%cst_76 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_77 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_78 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_79 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_80 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_81 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_82 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x16x96xf32>
%cst_83 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_84 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_85 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_86 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x24xf32>
%cst_87 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_88 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_89 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_90 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_91 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_92 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_93 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x24x144xf32>
%cst_94 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_95 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_96 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_97 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x144x24xf32>
%cst_98 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_99 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_100 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_101 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_102 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_103 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_104 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x24x144xf32>
%cst_105 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_106 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_107 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_108 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x144x32xf32>
%cst_109 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_110 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_111 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_112 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_113 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_114 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_115 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_116 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_117 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_118 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_119 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x32xf32>
%cst_120 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_121 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_122 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_123 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_124 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_125 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_126 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_127 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_128 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_129 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_130 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x32xf32>
%cst_131 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_132 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_133 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_134 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_135 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_136 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_137 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_138 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_139 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_140 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_141 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x64xf32>
%cst_142 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_143 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_144 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_145 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_146 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_147 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_148 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_149 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_150 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_151 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_152 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_153 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_154 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_155 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_156 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_157 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_158 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_159 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_160 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_161 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_162 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_163 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_164 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_165 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_166 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_167 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_168 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_169 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_170 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_171 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_172 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_173 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_174 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_175 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_176 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_177 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_178 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x3x32xf32>
%cst_179 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_180 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_181 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_182 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x320x1280xf32>
%cst_183 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_184 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_185 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_186 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_187 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_188 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_189 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x16xf32>
%cst_190 = constant opaque<"_", "0xDEADBEEF"> : tensor<1000xf32>
%cst_191 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280x1000xf32>
%cst_192 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_193 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_194 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_195 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_196 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_197 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_198 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_199 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_200 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_201 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_202 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_203 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_204 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_205 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_206 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_207 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_208 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_209 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_210 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_211 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_212 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_213 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_214 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_215 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_216 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_217 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_218 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_219 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_220 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_221 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_222 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_223 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_224 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_225 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_226 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_227 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_228 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_229 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_230 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_231 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_232 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_233 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_234 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_235 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_236 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_237 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_238 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_239 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_240 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_241 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_242 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_243 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_244 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x32xf32>
%cst_245 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x96xf32>
%cst_246 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x144xf32>
%cst_247 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x144xf32>
%cst_248 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_249 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_250 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_251 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_252 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_253 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_254 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_255 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_256 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_257 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_258 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_259 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_260 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_261 = constant 0xFF800000 : f32
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 0.000000e+00 : f32
%cst_264 = constant 6.000000e+00 : f32
%cst_265 = constant 4.900000e+01 : f32
%cst_266 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1000xf32>
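// Stem: zero-pad the 224x224 input to 225x225 (SAME padding for a stride-2
// 3x3 conv), then run the first full convolution to produce 1x112x112x32.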
%0 = linalg.init_tensor [1, 225, 225, 3] : tensor<1x225x225x3xf32>
%1 = linalg.fill(%0, %cst_263) : tensor<1x225x225x3xf32>, f32 -> tensor<1x225x225x3xf32>
%2 = subtensor_insert %arg0 into %1[0, 0, 0, 0] [1, 224, 224, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> into tensor<1x225x225x3xf32>
%3 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%4 = linalg.fill(%3, %cst_263) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
%5 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%2, %cst_178 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%4 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
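// Batch-normalization-style affine rescale, (x - a) * b / c + d, fused with a
// NaN-propagating clamp to [0, 6] (ReLU6): the `cmpf uno` + select pairs route
// NaN inputs to %cst_262 (quiet NaN) rather than silently clamping them.
// The same generic body recurs after most convolutions below.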
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %cst_175, %cst_177, %cst_192, %cst_176 : tensor<1x112x112x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>) outs(%3 : tensor<1x112x112x32xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x112x112x32xf32>
%7 = linalg.init_tensor [1, 114, 114, 32] : tensor<1x114x114x32xf32>
%8 = linalg.fill(%7, %cst_263) : tensor<1x114x114x32xf32>, f32 -> tensor<1x114x114x32xf32>
%9 = subtensor_insert %6 into %8[0, 1, 1, 0] [1, 112, 112, 32] [1, 1, 1, 1] : tensor<1x112x112x32xf32> into tensor<1x114x114x32xf32>
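// Stride-1 depthwise 3x3: pad by one pixel on every side (112 -> 114) so the
// output stays 112x112.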
%10 = linalg.fill(%3, %cst_263) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
%11 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%9, %cst_244 : tensor<1x114x114x32xf32>, tensor<3x3x32xf32>) outs(%10 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11, %cst_183, %cst_185, %cst_193, %cst_184 : tensor<1x112x112x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>) outs(%3 : tensor<1x112x112x32xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x112x112x32xf32>
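// Linear bottleneck: 1x1 projection down to 16 channels. The normalization
// that follows has no ReLU6 clamp, matching the non-activated projection of
// an inverted-residual block.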
%13 = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
%14 = linalg.fill(%13, %cst_263) : tensor<1x112x112x16xf32>, f32 -> tensor<1x112x112x16xf32>
%15 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%12, %cst_189 : tensor<1x112x112x32xf32>, tensor<1x1x32x16xf32>) outs(%14 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %cst_186, %cst_188, %cst_194, %cst_187 : tensor<1x112x112x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) outs(%13 : tensor<1x112x112x16xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
linalg.yield %240 : f32
} -> tensor<1x112x112x16xf32>
%17 = linalg.init_tensor [1, 112, 112, 96] : tensor<1x112x112x96xf32>
%18 = linalg.fill(%17, %cst_263) : tensor<1x112x112x96xf32>, f32 -> tensor<1x112x112x96xf32>
%19 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%16, %cst_82 : tensor<1x112x112x16xf32>, tensor<1x1x16x96xf32>) outs(%18 : tensor<1x112x112x96xf32>) -> tensor<1x112x112x96xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %cst_79, %cst_81, %cst_195, %cst_80 : tensor<1x112x112x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) outs(%17 : tensor<1x112x112x96xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x112x112x96xf32>
%21 = linalg.init_tensor [1, 113, 113, 96] : tensor<1x113x113x96xf32>
%22 = linalg.fill(%21, %cst_263) : tensor<1x113x113x96xf32>, f32 -> tensor<1x113x113x96xf32>
%23 = subtensor_insert %20 into %22[0, 0, 0, 0] [1, 112, 112, 96] [1, 1, 1, 1] : tensor<1x112x112x96xf32> into tensor<1x113x113x96xf32>
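// Stride-2 depthwise conv uses asymmetric SAME padding: pad bottom/right only
// (112 -> 113), halving the spatial dims to 56x56.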
%24 = linalg.init_tensor [1, 56, 56, 96] : tensor<1x56x56x96xf32>
%25 = linalg.fill(%24, %cst_263) : tensor<1x56x56x96xf32>, f32 -> tensor<1x56x56x96xf32>
%26 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%23, %cst_245 : tensor<1x113x113x96xf32>, tensor<3x3x96xf32>) outs(%25 : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26, %cst_76, %cst_78, %cst_196, %cst_77 : tensor<1x56x56x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) outs(%24 : tensor<1x56x56x96xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x56x56x96xf32>
%28 = linalg.init_tensor [1, 56, 56, 24] : tensor<1x56x56x24xf32>
%29 = linalg.fill(%28, %cst_263) : tensor<1x56x56x24xf32>, f32 -> tensor<1x56x56x24xf32>
%30 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%27, %cst_86 : tensor<1x56x56x96xf32>, tensor<1x1x96x24xf32>) outs(%29 : tensor<1x56x56x24xf32>) -> tensor<1x56x56x24xf32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30, %cst_83, %cst_85, %cst_197, %cst_84 : tensor<1x56x56x24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>) outs(%28 : tensor<1x56x56x24xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
linalg.yield %240 : f32
} -> tensor<1x56x56x24xf32>
%32 = linalg.init_tensor [1, 56, 56, 144] : tensor<1x56x56x144xf32>
%33 = linalg.fill(%32, %cst_263) : tensor<1x56x56x144xf32>, f32 -> tensor<1x56x56x144xf32>
%34 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%31, %cst_93 : tensor<1x56x56x24xf32>, tensor<1x1x24x144xf32>) outs(%33 : tensor<1x56x56x144xf32>) -> tensor<1x56x56x144xf32>
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%34, %cst_90, %cst_92, %cst_198, %cst_91 : tensor<1x56x56x144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>) outs(%32 : tensor<1x56x56x144xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x56x56x144xf32>
%36 = linalg.init_tensor [1, 58, 58, 144] : tensor<1x58x58x144xf32>
%37 = linalg.fill(%36, %cst_263) : tensor<1x58x58x144xf32>, f32 -> tensor<1x58x58x144xf32>
%38 = subtensor_insert %35 into %37[0, 1, 1, 0] [1, 56, 56, 144] [1, 1, 1, 1] : tensor<1x56x56x144xf32> into tensor<1x58x58x144xf32>
%39 = linalg.fill(%32, %cst_263) : tensor<1x56x56x144xf32>, f32 -> tensor<1x56x56x144xf32>
%40 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%38, %cst_246 : tensor<1x58x58x144xf32>, tensor<3x3x144xf32>) outs(%39 : tensor<1x56x56x144xf32>) -> tensor<1x56x56x144xf32>
%41 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%40, %cst_87, %cst_89, %cst_199, %cst_88 : tensor<1x56x56x144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>) outs(%32 : tensor<1x56x56x144xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x56x56x144xf32>
%42 = linalg.fill(%28, %cst_263) : tensor<1x56x56x24xf32>, f32 -> tensor<1x56x56x24xf32>
%43 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%41, %cst_97 : tensor<1x56x56x144xf32>, tensor<1x1x144x24xf32>) outs(%42 : tensor<1x56x56x24xf32>) -> tensor<1x56x56x24xf32>
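// Residual connection: normalize the projection output and add the block
// input (%31) element-wise; this is the skip path of an inverted-residual
// block.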
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%31, %43, %cst_94, %cst_96, %cst_200, %cst_95 : tensor<1x56x56x24xf32>, tensor<1x56x56x24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>) outs(%28 : tensor<1x56x56x24xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x56x56x24xf32>
%45 = linalg.fill(%32, %cst_263) : tensor<1x56x56x144xf32>, f32 -> tensor<1x56x56x144xf32>
%46 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%44, %cst_104 : tensor<1x56x56x24xf32>, tensor<1x1x24x144xf32>) outs(%45 : tensor<1x56x56x144xf32>) -> tensor<1x56x56x144xf32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %cst_101, %cst_103, %cst_201, %cst_102 : tensor<1x56x56x144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>) outs(%32 : tensor<1x56x56x144xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x56x56x144xf32>
%48 = linalg.init_tensor [1, 57, 57, 144] : tensor<1x57x57x144xf32>
%49 = linalg.fill(%48, %cst_263) : tensor<1x57x57x144xf32>, f32 -> tensor<1x57x57x144xf32>
%50 = subtensor_insert %47 into %49[0, 0, 0, 0] [1, 56, 56, 144] [1, 1, 1, 1] : tensor<1x56x56x144xf32> into tensor<1x57x57x144xf32>
%51 = linalg.init_tensor [1, 28, 28, 144] : tensor<1x28x28x144xf32>
%52 = linalg.fill(%51, %cst_263) : tensor<1x28x28x144xf32>, f32 -> tensor<1x28x28x144xf32>
%53 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%50, %cst_247 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%52 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %cst_98, %cst_100, %cst_202, %cst_99 : tensor<1x28x28x144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>) outs(%51 : tensor<1x28x28x144xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x28x28x144xf32>
%55 = linalg.init_tensor [1, 28, 28, 32] : tensor<1x28x28x32xf32>
%56 = linalg.fill(%55, %cst_263) : tensor<1x28x28x32xf32>, f32 -> tensor<1x28x28x32xf32>
%57 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%54, %cst_108 : tensor<1x28x28x144xf32>, tensor<1x1x144x32xf32>) outs(%56 : tensor<1x28x28x32xf32>) -> tensor<1x28x28x32xf32>
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%57, %cst_105, %cst_107, %cst_203, %cst_106 : tensor<1x28x28x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>) outs(%55 : tensor<1x28x28x32xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
linalg.yield %240 : f32
} -> tensor<1x28x28x32xf32>
%59 = linalg.init_tensor [1, 28, 28, 192] : tensor<1x28x28x192xf32>
%60 = linalg.fill(%59, %cst_263) : tensor<1x28x28x192xf32>, f32 -> tensor<1x28x28x192xf32>
%61 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%58, %cst_115 : tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) outs(%60 : tensor<1x28x28x192xf32>) -> tensor<1x28x28x192xf32>
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%61, %cst_112, %cst_114, %cst_204, %cst_113 : tensor<1x28x28x192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>) outs(%59 : tensor<1x28x28x192xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x28x28x192xf32>
%63 = linalg.init_tensor [1, 30, 30, 192] : tensor<1x30x30x192xf32>
%64 = linalg.fill(%63, %cst_263) : tensor<1x30x30x192xf32>, f32 -> tensor<1x30x30x192xf32>
%65 = subtensor_insert %62 into %64[0, 1, 1, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x30x30x192xf32>
%66 = linalg.fill(%59, %cst_263) : tensor<1x28x28x192xf32>, f32 -> tensor<1x28x28x192xf32>
%67 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%65, %cst_248 : tensor<1x30x30x192xf32>, tensor<3x3x192xf32>) outs(%66 : tensor<1x28x28x192xf32>) -> tensor<1x28x28x192xf32>
%68 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%67, %cst_109, %cst_111, %cst_205, %cst_110 : tensor<1x28x28x192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>) outs(%59 : tensor<1x28x28x192xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x28x28x192xf32>
%69 = linalg.fill(%55, %cst_263) : tensor<1x28x28x32xf32>, f32 -> tensor<1x28x28x32xf32>
%70 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%68, %cst_119 : tensor<1x28x28x192xf32>, tensor<1x1x192x32xf32>) outs(%69 : tensor<1x28x28x32xf32>) -> tensor<1x28x28x32xf32>
%71 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%58, %70, %cst_116, %cst_118, %cst_206, %cst_117 : tensor<1x28x28x32xf32>, tensor<1x28x28x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>) outs(%55 : tensor<1x28x28x32xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x28x28x32xf32>
%72 = linalg.fill(%59, %cst_263) : tensor<1x28x28x192xf32>, f32 -> tensor<1x28x28x192xf32>
%73 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%71, %cst_126 : tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) outs(%72 : tensor<1x28x28x192xf32>) -> tensor<1x28x28x192xf32>
%74 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%73, %cst_123, %cst_125, %cst_207, %cst_124 : tensor<1x28x28x192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>) outs(%59 : tensor<1x28x28x192xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x28x28x192xf32>
%75 = linalg.fill(%63, %cst_263) : tensor<1x30x30x192xf32>, f32 -> tensor<1x30x30x192xf32>
%76 = subtensor_insert %74 into %75[0, 1, 1, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x30x30x192xf32>
%77 = linalg.fill(%59, %cst_263) : tensor<1x28x28x192xf32>, f32 -> tensor<1x28x28x192xf32>
%78 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%76, %cst_249 : tensor<1x30x30x192xf32>, tensor<3x3x192xf32>) outs(%77 : tensor<1x28x28x192xf32>) -> tensor<1x28x28x192xf32>
%79 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%78, %cst_120, %cst_122, %cst_208, %cst_121 : tensor<1x28x28x192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>) outs(%59 : tensor<1x28x28x192xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x28x28x192xf32>
%80 = linalg.fill(%55, %cst_263) : tensor<1x28x28x32xf32>, f32 -> tensor<1x28x28x32xf32>
%81 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%79, %cst_130 : tensor<1x28x28x192xf32>, tensor<1x1x192x32xf32>) outs(%80 : tensor<1x28x28x32xf32>) -> tensor<1x28x28x32xf32>
%82 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%71, %81, %cst_127, %cst_129, %cst_209, %cst_128 : tensor<1x28x28x32xf32>, tensor<1x28x28x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>) outs(%55 : tensor<1x28x28x32xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x28x28x32xf32>
%83 = linalg.fill(%59, %cst_263) : tensor<1x28x28x192xf32>, f32 -> tensor<1x28x28x192xf32>
%84 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%82, %cst_137 : tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) outs(%83 : tensor<1x28x28x192xf32>) -> tensor<1x28x28x192xf32>
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%84, %cst_134, %cst_136, %cst_210, %cst_135 : tensor<1x28x28x192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>) outs(%59 : tensor<1x28x28x192xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x28x28x192xf32>
%86 = linalg.init_tensor [1, 29, 29, 192] : tensor<1x29x29x192xf32>
%87 = linalg.fill(%86, %cst_263) : tensor<1x29x29x192xf32>, f32 -> tensor<1x29x29x192xf32>
%88 = subtensor_insert %85 into %87[0, 0, 0, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x29x29x192xf32>
%89 = linalg.init_tensor [1, 14, 14, 192] : tensor<1x14x14x192xf32>
%90 = linalg.fill(%89, %cst_263) : tensor<1x14x14x192xf32>, f32 -> tensor<1x14x14x192xf32>
%91 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%88, %cst_250 : tensor<1x29x29x192xf32>, tensor<3x3x192xf32>) outs(%90 : tensor<1x14x14x192xf32>) -> tensor<1x14x14x192xf32>
%92 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%91, %cst_131, %cst_133, %cst_211, %cst_132 : tensor<1x14x14x192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>) outs(%89 : tensor<1x14x14x192xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x192xf32>
%93 = linalg.init_tensor [1, 14, 14, 64] : tensor<1x14x14x64xf32>
%94 = linalg.fill(%93, %cst_263) : tensor<1x14x14x64xf32>, f32 -> tensor<1x14x14x64xf32>
%95 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%92, %cst_141 : tensor<1x14x14x192xf32>, tensor<1x1x192x64xf32>) outs(%94 : tensor<1x14x14x64xf32>) -> tensor<1x14x14x64xf32>
%96 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%95, %cst_138, %cst_140, %cst_212, %cst_139 : tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%93 : tensor<1x14x14x64xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
linalg.yield %240 : f32
} -> tensor<1x14x14x64xf32>
%97 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%98 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%99 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%96, %cst_148 : tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) outs(%98 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%99, %cst_145, %cst_147, %cst_213, %cst_146 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%101 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%102 = linalg.fill(%101, %cst_263) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
%103 = subtensor_insert %100 into %102[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
%104 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%105 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%103, %cst_251 : tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) outs(%104 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%106 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%105, %cst_142, %cst_144, %cst_214, %cst_143 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%107 = linalg.fill(%93, %cst_263) : tensor<1x14x14x64xf32>, f32 -> tensor<1x14x14x64xf32>
%108 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%106, %cst_152 : tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) outs(%107 : tensor<1x14x14x64xf32>) -> tensor<1x14x14x64xf32>
%109 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%96, %108, %cst_149, %cst_151, %cst_215, %cst_150 : tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%93 : tensor<1x14x14x64xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x14x14x64xf32>
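// Inverted-residual skip connection: this generic fuses the projection conv's batch-norm
// transform (on %arg2 = %108) with the residual add of %arg1 = %96, the input of the
// bottleneck block. Note there is no clamp here; in a MobileNetV2-style architecture the
// projection layers are linear (activation-free).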
%110 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%111 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%109, %cst_159 : tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) outs(%110 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%111, %cst_156, %cst_158, %cst_216, %cst_157 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%113 = linalg.fill(%101, %cst_263) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
%114 = subtensor_insert %112 into %113[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
%115 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%116 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%114, %cst_252 : tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) outs(%115 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%117 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%116, %cst_153, %cst_155, %cst_217, %cst_154 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%118 = linalg.fill(%93, %cst_263) : tensor<1x14x14x64xf32>, f32 -> tensor<1x14x14x64xf32>
%119 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%117, %cst_163 : tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) outs(%118 : tensor<1x14x14x64xf32>) -> tensor<1x14x14x64xf32>
%120 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%109, %119, %cst_160, %cst_162, %cst_218, %cst_161 : tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%93 : tensor<1x14x14x64xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x14x14x64xf32>
%121 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%120, %cst_170 : tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) outs(%121 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %cst_167, %cst_169, %cst_219, %cst_168 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%124 = linalg.fill(%101, %cst_263) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
%125 = subtensor_insert %123 into %124[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
%126 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%127 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%125, %cst_253 : tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) outs(%126 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%127, %cst_164, %cst_166, %cst_220, %cst_165 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%129 = linalg.fill(%93, %cst_263) : tensor<1x14x14x64xf32>, f32 -> tensor<1x14x14x64xf32>
%130 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%128, %cst_174 : tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) outs(%129 : tensor<1x14x14x64xf32>) -> tensor<1x14x14x64xf32>
%131 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%120, %130, %cst_171, %cst_173, %cst_221, %cst_172 : tensor<1x14x14x64xf32>, tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%93 : tensor<1x14x14x64xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x14x14x64xf32>
%132 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%133 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%131, %cst_5 : tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) outs(%132 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%134 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%133, %cst_2, %cst_4, %cst_222, %cst_3 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%135 = linalg.fill(%101, %cst_263) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
%136 = subtensor_insert %134 into %135[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
%137 = linalg.fill(%97, %cst_263) : tensor<1x14x14x384xf32>, f32 -> tensor<1x14x14x384xf32>
%138 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%136, %cst_254 : tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) outs(%137 : tensor<1x14x14x384xf32>) -> tensor<1x14x14x384xf32>
%139 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%138, %cst, %cst_1, %cst_223, %cst_0 : tensor<1x14x14x384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>) outs(%97 : tensor<1x14x14x384xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x384xf32>
%140 = linalg.init_tensor [1, 14, 14, 96] : tensor<1x14x14x96xf32>
%141 = linalg.fill(%140, %cst_263) : tensor<1x14x14x96xf32>, f32 -> tensor<1x14x14x96xf32>
%142 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%139, %cst_9 : tensor<1x14x14x384xf32>, tensor<1x1x384x96xf32>) outs(%141 : tensor<1x14x14x96xf32>) -> tensor<1x14x14x96xf32>
%143 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%142, %cst_6, %cst_8, %cst_224, %cst_7 : tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) outs(%140 : tensor<1x14x14x96xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
linalg.yield %240 : f32
} -> tensor<1x14x14x96xf32>
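// Projection epilogue without a clamp: the 384 -> 96 bottleneck output receives only the
// folded batch-norm affine transform, again matching the linear projections of an
// inverted-residual block (no residual add here since the channel count changes).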
%144 = linalg.init_tensor [1, 14, 14, 576] : tensor<1x14x14x576xf32>
%145 = linalg.fill(%144, %cst_263) : tensor<1x14x14x576xf32>, f32 -> tensor<1x14x14x576xf32>
%146 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%143, %cst_16 : tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) outs(%145 : tensor<1x14x14x576xf32>) -> tensor<1x14x14x576xf32>
%147 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%146, %cst_13, %cst_15, %cst_225, %cst_14 : tensor<1x14x14x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) outs(%144 : tensor<1x14x14x576xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x576xf32>
%148 = linalg.init_tensor [1, 16, 16, 576] : tensor<1x16x16x576xf32>
%149 = linalg.fill(%148, %cst_263) : tensor<1x16x16x576xf32>, f32 -> tensor<1x16x16x576xf32>
%150 = subtensor_insert %147 into %149[0, 1, 1, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x16x16x576xf32>
%151 = linalg.fill(%144, %cst_263) : tensor<1x14x14x576xf32>, f32 -> tensor<1x14x14x576xf32>
%152 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%150, %cst_255 : tensor<1x16x16x576xf32>, tensor<3x3x576xf32>) outs(%151 : tensor<1x14x14x576xf32>) -> tensor<1x14x14x576xf32>
%153 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%152, %cst_10, %cst_12, %cst_226, %cst_11 : tensor<1x14x14x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) outs(%144 : tensor<1x14x14x576xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x576xf32>
%154 = linalg.fill(%140, %cst_263) : tensor<1x14x14x96xf32>, f32 -> tensor<1x14x14x96xf32>
%155 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%153, %cst_20 : tensor<1x14x14x576xf32>, tensor<1x1x576x96xf32>) outs(%154 : tensor<1x14x14x96xf32>) -> tensor<1x14x14x96xf32>
%156 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%143, %155, %cst_17, %cst_19, %cst_227, %cst_18 : tensor<1x14x14x96xf32>, tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) outs(%140 : tensor<1x14x14x96xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x14x14x96xf32>
%157 = linalg.fill(%144, %cst_263) : tensor<1x14x14x576xf32>, f32 -> tensor<1x14x14x576xf32>
%158 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%156, %cst_27 : tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) outs(%157 : tensor<1x14x14x576xf32>) -> tensor<1x14x14x576xf32>
%159 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%158, %cst_24, %cst_26, %cst_228, %cst_25 : tensor<1x14x14x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) outs(%144 : tensor<1x14x14x576xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x576xf32>
%160 = linalg.fill(%148, %cst_263) : tensor<1x16x16x576xf32>, f32 -> tensor<1x16x16x576xf32>
%161 = subtensor_insert %159 into %160[0, 1, 1, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x16x16x576xf32>
%162 = linalg.fill(%144, %cst_263) : tensor<1x14x14x576xf32>, f32 -> tensor<1x14x14x576xf32>
%163 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%161, %cst_256 : tensor<1x16x16x576xf32>, tensor<3x3x576xf32>) outs(%162 : tensor<1x14x14x576xf32>) -> tensor<1x14x14x576xf32>
%164 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%163, %cst_21, %cst_23, %cst_229, %cst_22 : tensor<1x14x14x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) outs(%144 : tensor<1x14x14x576xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x576xf32>
%165 = linalg.fill(%140, %cst_263) : tensor<1x14x14x96xf32>, f32 -> tensor<1x14x14x96xf32>
%166 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%164, %cst_31 : tensor<1x14x14x576xf32>, tensor<1x1x576x96xf32>) outs(%165 : tensor<1x14x14x96xf32>) -> tensor<1x14x14x96xf32>
%167 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%156, %166, %cst_28, %cst_30, %cst_230, %cst_29 : tensor<1x14x14x96xf32>, tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) outs(%140 : tensor<1x14x14x96xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x14x14x96xf32>
%168 = linalg.fill(%144, %cst_263) : tensor<1x14x14x576xf32>, f32 -> tensor<1x14x14x576xf32>
%169 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%167, %cst_38 : tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) outs(%168 : tensor<1x14x14x576xf32>) -> tensor<1x14x14x576xf32>
%170 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%169, %cst_35, %cst_37, %cst_231, %cst_36 : tensor<1x14x14x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) outs(%144 : tensor<1x14x14x576xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x14x14x576xf32>
%171 = linalg.init_tensor [1, 15, 15, 576] : tensor<1x15x15x576xf32>
%172 = linalg.fill(%171, %cst_263) : tensor<1x15x15x576xf32>, f32 -> tensor<1x15x15x576xf32>
%173 = subtensor_insert %170 into %172[0, 0, 0, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x15x15x576xf32>
%174 = linalg.init_tensor [1, 7, 7, 576] : tensor<1x7x7x576xf32>
%175 = linalg.fill(%174, %cst_263) : tensor<1x7x7x576xf32>, f32 -> tensor<1x7x7x576xf32>
%176 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%173, %cst_257 : tensor<1x15x15x576xf32>, tensor<3x3x576xf32>) outs(%175 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
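// Stride-2 depthwise stage: this time the input is padded asymmetrically, inserted at offset
// [0, 0, 0, 0] into a zero-filled 1x15x15x576 tensor, i.e. one row/column of padding on the
// bottom and right only, so the 3x3 depthwise conv with strides = 2 halves the spatial
// dimensions from 14x14 to 7x7.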
%177 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%176, %cst_32, %cst_34, %cst_232, %cst_33 : tensor<1x7x7x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) outs(%174 : tensor<1x7x7x576xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x7x7x576xf32>
%178 = linalg.init_tensor [1, 7, 7, 160] : tensor<1x7x7x160xf32>
%179 = linalg.fill(%178, %cst_263) : tensor<1x7x7x160xf32>, f32 -> tensor<1x7x7x160xf32>
%180 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%177, %cst_42 : tensor<1x7x7x576xf32>, tensor<1x1x576x160xf32>) outs(%179 : tensor<1x7x7x160xf32>) -> tensor<1x7x7x160xf32>
%181 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%180, %cst_39, %cst_41, %cst_233, %cst_40 : tensor<1x7x7x160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>) outs(%178 : tensor<1x7x7x160xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
linalg.yield %240 : f32
} -> tensor<1x7x7x160xf32>
%182 = linalg.init_tensor [1, 7, 7, 960] : tensor<1x7x7x960xf32>
%183 = linalg.fill(%182, %cst_263) : tensor<1x7x7x960xf32>, f32 -> tensor<1x7x7x960xf32>
%184 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%181, %cst_49 : tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) outs(%183 : tensor<1x7x7x960xf32>) -> tensor<1x7x7x960xf32>
%185 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%184, %cst_46, %cst_48, %cst_234, %cst_47 : tensor<1x7x7x960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>) outs(%182 : tensor<1x7x7x960xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x7x7x960xf32>
%186 = linalg.init_tensor [1, 9, 9, 960] : tensor<1x9x9x960xf32>
%187 = linalg.fill(%186, %cst_263) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
%188 = subtensor_insert %185 into %187[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
%189 = linalg.fill(%182, %cst_263) : tensor<1x7x7x960xf32>, f32 -> tensor<1x7x7x960xf32>
%190 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%188, %cst_258 : tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) outs(%189 : tensor<1x7x7x960xf32>) -> tensor<1x7x7x960xf32>
%191 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%190, %cst_43, %cst_45, %cst_235, %cst_44 : tensor<1x7x7x960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>) outs(%182 : tensor<1x7x7x960xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x7x7x960xf32>
%192 = linalg.fill(%178, %cst_263) : tensor<1x7x7x160xf32>, f32 -> tensor<1x7x7x160xf32>
%193 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%191, %cst_53 : tensor<1x7x7x960xf32>, tensor<1x1x960x160xf32>) outs(%192 : tensor<1x7x7x160xf32>) -> tensor<1x7x7x160xf32>
%194 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%181, %193, %cst_50, %cst_52, %cst_236, %cst_51 : tensor<1x7x7x160xf32>, tensor<1x7x7x160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>) outs(%178 : tensor<1x7x7x160xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x7x7x160xf32>
%195 = linalg.fill(%182, %cst_263) : tensor<1x7x7x960xf32>, f32 -> tensor<1x7x7x960xf32>
%196 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%194, %cst_60 : tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) outs(%195 : tensor<1x7x7x960xf32>) -> tensor<1x7x7x960xf32>
%197 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%196, %cst_57, %cst_59, %cst_237, %cst_58 : tensor<1x7x7x960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>) outs(%182 : tensor<1x7x7x960xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x7x7x960xf32>
%198 = linalg.fill(%186, %cst_263) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
%199 = subtensor_insert %197 into %198[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
%200 = linalg.fill(%182, %cst_263) : tensor<1x7x7x960xf32>, f32 -> tensor<1x7x7x960xf32>
%201 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%199, %cst_259 : tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) outs(%200 : tensor<1x7x7x960xf32>) -> tensor<1x7x7x960xf32>
%202 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%201, %cst_54, %cst_56, %cst_238, %cst_55 : tensor<1x7x7x960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>) outs(%182 : tensor<1x7x7x960xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x7x7x960xf32>
%203 = linalg.fill(%178, %cst_263) : tensor<1x7x7x160xf32>, f32 -> tensor<1x7x7x160xf32>
%204 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%202, %cst_64 : tensor<1x7x7x960xf32>, tensor<1x1x960x160xf32>) outs(%203 : tensor<1x7x7x160xf32>) -> tensor<1x7x7x160xf32>
%205 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%194, %204, %cst_61, %cst_63, %cst_239, %cst_62 : tensor<1x7x7x160xf32>, tensor<1x7x7x160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>) outs(%178 : tensor<1x7x7x160xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%237 = subf %arg2, %arg3 : f32
%238 = mulf %237, %arg4 : f32
%239 = divf %238, %arg5 : f32
%240 = addf %239, %arg6 : f32
%241 = addf %arg1, %240 : f32
linalg.yield %241 : f32
} -> tensor<1x7x7x160xf32>
%206 = linalg.fill(%182, %cst_263) : tensor<1x7x7x960xf32>, f32 -> tensor<1x7x7x960xf32>
%207 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%205, %cst_71 : tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) outs(%206 : tensor<1x7x7x960xf32>) -> tensor<1x7x7x960xf32>
%208 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%207, %cst_68, %cst_70, %cst_240, %cst_69 : tensor<1x7x7x960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>) outs(%182 : tensor<1x7x7x960xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x7x7x960xf32>
%209 = linalg.fill(%186, %cst_263) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
%210 = subtensor_insert %208 into %209[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
%211 = linalg.fill(%182, %cst_263) : tensor<1x7x7x960xf32>, f32 -> tensor<1x7x7x960xf32>
%212 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%210, %cst_260 : tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) outs(%211 : tensor<1x7x7x960xf32>) -> tensor<1x7x7x960xf32>
%213 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%212, %cst_65, %cst_67, %cst_241, %cst_66 : tensor<1x7x7x960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>) outs(%182 : tensor<1x7x7x960xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
linalg.yield %248 : f32
} -> tensor<1x7x7x960xf32>
%214 = linalg.init_tensor [1, 7, 7, 320] : tensor<1x7x7x320xf32>
%215 = linalg.fill(%214, %cst_263) : tensor<1x7x7x320xf32>, f32 -> tensor<1x7x7x320xf32>
%216 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%213, %cst_75 : tensor<1x7x7x960xf32>, tensor<1x1x960x320xf32>) outs(%215 : tensor<1x7x7x320xf32>) -> tensor<1x7x7x320xf32>
%217 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%216, %cst_72, %cst_74, %cst_242, %cst_73 : tensor<1x7x7x320xf32>, tensor<320xf32>, tensor<320xf32>, tensor<320xf32>, tensor<320xf32>) outs(%214 : tensor<1x7x7x320xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
linalg.yield %240 : f32
} -> tensor<1x7x7x320xf32>
%218 = linalg.init_tensor [1, 7, 7, 1280] : tensor<1x7x7x1280xf32>
%219 = linalg.fill(%218, %cst_263) : tensor<1x7x7x1280xf32>, f32 -> tensor<1x7x7x1280xf32>
%220 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%217, %cst_182 : tensor<1x7x7x320xf32>, tensor<1x1x320x1280xf32>) outs(%219 : tensor<1x7x7x1280xf32>) -> tensor<1x7x7x1280xf32>
%221 = linalg.tensor_reshape %220 [affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d2)>, affine_map<(d0, d1, d2, d3) -> (d3)>] : tensor<1x7x7x1280xf32> into tensor<7x7x1280xf32>
%222 = linalg.init_tensor [1280] : tensor<1280xf32>
%223 = linalg.fill(%222, %cst_263) : tensor<1280xf32>, f32 -> tensor<1280xf32>
%224 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%221, %cst_179, %cst_181, %cst_243, %cst_180 : tensor<7x7x1280xf32>, tensor<1280xf32>, tensor<1280xf32>, tensor<1280xf32>, tensor<1280xf32>) outs(%223 : tensor<1280xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%237 = subf %arg1, %arg2 : f32
%238 = mulf %237, %arg3 : f32
%239 = divf %238, %arg4 : f32
%240 = addf %239, %arg5 : f32
%241 = cmpf olt, %240, %cst_264 : f32
%242 = select %241, %240, %cst_264 : f32
%243 = cmpf uno, %240, %cst_264 : f32
%244 = select %243, %cst_262, %242 : f32
%245 = cmpf ogt, %244, %cst_263 : f32
%246 = select %245, %244, %cst_263 : f32
%247 = cmpf uno, %244, %cst_263 : f32
%248 = select %247, %cst_262, %246 : f32
%249 = addf %248, %arg6 : f32
linalg.yield %249 : f32
} -> tensor<1280xf32>
%225 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%224 : tensor<1280xf32>) outs(%222 : tensor<1280xf32>) {
^bb0(%arg1: f32, %arg2: f32): // no predecessors
%237 = divf %arg1, %cst_265 : f32
linalg.yield %237 : f32
} -> tensor<1280xf32>
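// Global average pooling, split across the two generics above: the first fuses the final
// batch-norm + clamp with a sum reduction over the 7x7 spatial dimensions of the 1280-channel
// feature map (note the reversed indexing map (d0, d1, d2) -> (d1, d2, d0) and the two
// reduction iterators), and the second divides each channel by %cst_265 (presumably
// 49.0 = 7 * 7) to turn the sum into a mean.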
%226 = linalg.tensor_reshape %225 [affine_map<(d0, d1) -> (d0, d1)>] : tensor<1280xf32> into tensor<1x1280xf32>
%227 = linalg.init_tensor [1, 1000] : tensor<1x1000xf32>
%228 = linalg.fill(%227, %cst_263) : tensor<1x1000xf32>, f32 -> tensor<1x1000xf32>
%229 = linalg.matmul ins(%226, %cst_191 : tensor<1x1280xf32>, tensor<1280x1000xf32>) outs(%228 : tensor<1x1000xf32>) -> tensor<1x1000xf32>
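// Classifier head: the pooled feature vector, reshaped to 1x1280, is multiplied against a
// 1280x1000 weight matrix as a plain linalg.matmul to produce the logits.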
%230 = linalg.tensor_reshape %229 [affine_map<(d0, d1) -> (d0, d1)>] : tensor<1x1000xf32> into tensor<1000xf32>
%231 = linalg.init_tensor [] : tensor<f32>
%232 = linalg.fill(%231, %cst_261) : tensor<f32>, f32 -> tensor<f32>
%233 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%230, %cst_190 : tensor<1000xf32>, tensor<1000xf32>) outs(%232 : tensor<f32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): // no predecessors
%237 = addf %arg1, %arg2 : f32
%238 = cmpf ogt, %237, %arg3 : f32
%239 = select %238, %237, %arg3 : f32
%240 = cmpf uno, %237, %arg3 : f32
%241 = select %240, %cst_262, %239 : f32
linalg.yield %241 : f32
} -> tensor<f32>
%234 = linalg.fill(%231, %cst_263) : tensor<f32>, f32 -> tensor<f32>
%235 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%230, %cst_190, %233 : tensor<1000xf32>, tensor<1000xf32>, tensor<f32>) outs(%234 : tensor<f32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%237 = addf %arg1, %arg2 : f32
%238 = subf %237, %arg3 : f32
%239 = math.exp %238 : f32
%240 = addf %239, %arg4 : f32
linalg.yield %240 : f32
} -> tensor<f32>
%236 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%229, %cst_266, %233, %235 : tensor<1x1000xf32>, tensor<1x1000xf32>, tensor<f32>, tensor<f32>) outs(%227 : tensor<1x1000xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32): // no predecessors
%237 = addf %arg1, %arg2 : f32
%238 = subf %237, %arg3 : f32
%239 = math.exp %238 : f32
%240 = divf %239, %arg4 : f32
linalg.yield %240 : f32
} -> tensor<1x1000xf32>
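// Numerically stable softmax over the 1000 logits, expressed as three generics: %233 reduces
// (logit + bias %cst_190) to its maximum, with the accumulator %232 seeded from %cst_261
// (presumably -infinity or a very negative float); %235 accumulates sum(exp(x - max)); and
// %236 computes exp(x + bias - max) / sum to yield the final 1x1000 probabilities.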
return %236 : tensor<1x1000xf32>
}
// *** IR Dump After mlir::iree_compiler::IREE::Flow::(anonymous namespace)::DispatchLinalgOnTensorsPass ***
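// This flow-dialect pass forms IREE's dispatch regions: it groups the fused linalg-on-tensors
// ops from the function above into flow.dispatch.workgroups ops, one per dispatchable kernel.
// The function is re-printed below after that transformation.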
func @call(%arg0: tensor<1x224x224x3xf32> {tf._user_specified_name = "x"}) -> tensor<1x1000xf32> attributes {iree.module.export, iree.reflection = {abi = "sip", abiv = 1 : i32, f = "I17!B13!d1d224d224d3R11!B8!d1d1000", fv = "1", sip = "I8!S5!k0_0R3!_0"}, tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf.shape<1x224x224x3>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, 
#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} {
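// Weight data is elided in this dump: each opaque<"_", "0xDEADBEEF"> payload below is a
// placeholder the printer substitutes for a large constant tensor, not actual parameter
// values; only the tensor types are meaningful.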
%cst = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_0 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_1 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_2 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_3 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_4 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_5 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_6 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_7 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_8 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_9 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x96xf32>
%cst_10 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_11 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_12 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_13 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_14 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_15 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_16 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_17 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_18 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_19 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_20 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x96xf32>
%cst_21 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_22 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_23 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_24 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_25 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_26 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_27 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_28 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_29 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_30 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_31 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x96xf32>
%cst_32 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_33 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_34 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_35 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_36 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_37 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_38 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_39 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_40 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_41 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_42 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x160xf32>
%cst_43 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_44 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_45 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_46 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_47 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_48 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_49 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_50 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_51 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_52 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_53 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x160xf32>
%cst_54 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_55 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_56 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_57 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_58 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_59 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_60 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_61 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_62 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_63 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_64 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x160xf32>
%cst_65 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_66 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_67 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_68 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_69 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_70 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_71 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_72 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_73 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_74 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_75 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x320xf32>
%cst_76 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_77 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_78 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_79 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_80 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_81 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_82 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x16x96xf32>
%cst_83 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_84 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_85 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_86 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x24xf32>
%cst_87 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_88 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_89 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_90 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_91 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_92 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_93 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x24x144xf32>
%cst_94 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_95 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_96 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_97 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x144x24xf32>
%cst_98 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_99 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_100 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_101 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_102 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_103 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_104 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x24x144xf32>
%cst_105 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_106 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_107 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_108 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x144x32xf32>
%cst_109 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_110 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_111 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_112 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_113 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_114 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_115 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_116 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_117 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_118 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_119 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x32xf32>
%cst_120 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_121 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_122 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_123 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_124 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_125 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_126 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_127 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_128 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_129 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_130 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x32xf32>
%cst_131 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_132 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_133 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_134 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_135 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_136 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_137 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_138 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_139 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_140 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_141 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x64xf32>
%cst_142 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_143 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_144 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_145 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_146 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_147 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_148 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_149 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_150 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_151 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_152 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_153 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_154 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_155 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_156 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_157 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_158 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_159 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_160 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_161 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_162 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_163 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_164 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_165 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_166 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_167 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_168 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_169 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_170 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_171 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_172 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_173 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_174 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_175 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_176 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_177 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_178 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x3x32xf32>
%cst_179 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_180 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_181 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_182 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x320x1280xf32>
%cst_183 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_184 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_185 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_186 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_187 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_188 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_189 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x16xf32>
%cst_190 = constant opaque<"_", "0xDEADBEEF"> : tensor<1000xf32>
%cst_191 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280x1000xf32>
%cst_192 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_193 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_194 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_195 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_196 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_197 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_198 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_199 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_200 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_201 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_202 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_203 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_204 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_205 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_206 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_207 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_208 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_209 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_210 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_211 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_212 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_213 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_214 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_215 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_216 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_217 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_218 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_219 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_220 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_221 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_222 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_223 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_224 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_225 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_226 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_227 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_228 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_229 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_230 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_231 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_232 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_233 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_234 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_235 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_236 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_237 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_238 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_239 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_240 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_241 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_242 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_243 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_244 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x32xf32>
%cst_245 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x96xf32>
%cst_246 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x144xf32>
%cst_247 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x144xf32>
%cst_248 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_249 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_250 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_251 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_252 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_253 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_254 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_255 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_256 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_257 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_258 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_259 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_260 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_261 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1000xf32>
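// Annotation: the opaque<"_", "0xDEADBEEF"> constants above are the model's
// weights and normalization parameters with their payloads elided for
// readability ("0xDEADBEEF" is the elision placeholder, not real data). The
// index constants below are the workload operands passed to the
// flow.dispatch.workgroups ops that follow.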
%c112 = constant 112 : index
%c24 = constant 24 : index
%c56 = constant 56 : index
%c28 = constant 28 : index
%c64 = constant 64 : index
%c14 = constant 14 : index
%c160 = constant 160 : index
%c320 = constant 320 : index
%c7 = constant 7 : index
%c1280 = constant 1280 : index
%c1000 = constant 1000 : index
%c3 = constant 3 : index
%c225 = constant 225 : index
%c32 = constant 32 : index
%c114 = constant 114 : index
%c96 = constant 96 : index
%c113 = constant 113 : index
%c58 = constant 58 : index
%c144 = constant 144 : index
%c57 = constant 57 : index
%c30 = constant 30 : index
%c192 = constant 192 : index
%c29 = constant 29 : index
%c384 = constant 384 : index
%c16 = constant 16 : index
%c576 = constant 576 : index
%c15 = constant 15 : index
%c960 = constant 960 : index
%c9 = constant 9 : index
%c1 = constant 1 : index
%0 = flow.dispatch.workgroups[%c3, %c225, %c225]() : () -> tensor<1x225x225x3xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x225x225x3xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 225, 225, 3] : tensor<1x225x225x3xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x225x225x3xf32>, f32 -> tensor<1x225x225x3xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x225x225x3xf32> -> !flow.dispatch.tensor<writeonly:1x225x225x3xf32>
flow.return
}
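// Dispatch %1: subtensor_insert copies the 1x224x224x3 input into the
// zero-filled buffer at offset [0, 0, 0, 0], completing the pad.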
%1 = flow.dispatch.workgroups[%c3, %c225, %c225](%arg0, %0) : (tensor<1x224x224x3xf32>, tensor<1x225x225x3xf32>) -> %0 =
(%arg1: !flow.dispatch.tensor<readonly:1x224x224x3xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x225x225x3xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x225x225x3xf32> -> tensor<1x225x225x3xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 224, 224, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> into tensor<1x225x225x3xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x225x225x3xf32> -> !flow.dispatch.tensor<readwrite:1x225x225x3xf32>
flow.return
}
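// Dispatch %2: the network stem. A 3x3, stride-2 convolution (3 -> 32
// channels) with what appears to be a folded batch-norm affine transform and
// a ReLU6 clamp fused into the trailing linalg.generic; the 1x224x224x3 ->
// 1x1000 signature and the channel counts suggest a MobileNetV2-style
// classifier, though the dump itself does not name the model.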
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%cst_175, %cst_177, %cst_192, %cst_176, %1, %cst_178) : (tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c112_265 = constant 112 : index
%c32_266 = constant 32 : index
%94 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
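// Standard IREE workgroup distribution: each workgroup begins its tile at
// id * size and strides by count * size along each of the three tiled
// dimensions, so the scf.for nest below covers the full 112x112x32 output.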
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c112_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c112_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c32_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%115 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%116 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%101, %arg8)
%117 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%118 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%102, %arg9)
%119 = flow.dispatch.tensor.load %arg5, offsets = [0, %115, %117, 0], sizes = [1, %116, %118, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [3, 3, 3, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%125 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
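// The generic below computes (x - %arg12) * %arg13 / %arg14 + %arg15 per
// output channel (a batch-norm-like affine transform), then clamps the result
// to [0, 6] (ReLU6). The `cmpf uno` + select pairs propagate NaN: an
// unordered compare selects the quiet-NaN constant 0x7FC00000 instead of the
// clamped value. The same fused pattern recurs in the later dispatches.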
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%127, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%131 = subf %arg11, %arg12 : f32
%132 = mulf %131, %arg13 : f32
%133 = divf %132, %arg14 : f32
%134 = addf %133, %arg15 : f32
%135 = cmpf olt, %134, %cst_263 : f32
%136 = select %135, %134, %cst_263 : f32
%137 = cmpf uno, %134, %cst_263 : f32
%138 = select %137, %cst_262, %136 : f32
%139 = cmpf ogt, %138, %cst_264 : f32
%140 = select %139, %138, %cst_264 : f32
%141 = cmpf uno, %138, %cst_264 : f32
%142 = select %141, %cst_262, %140 : f32
linalg.yield %142 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
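// Dispatch %3: zero-fill a 1x114x114x32 buffer for the depthwise conv's pad.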
%3 = flow.dispatch.workgroups[%c32, %c114, %c114]() : () -> tensor<1x114x114x32xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x114x114x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 114, 114, 32] : tensor<1x114x114x32xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x114x114x32xf32>, f32 -> tensor<1x114x114x32xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x114x114x32xf32> -> !flow.dispatch.tensor<writeonly:1x114x114x32xf32>
flow.return
}
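// Dispatch %4: insert the stem output at offset [0, 1, 1, 0], i.e. pad one
// pixel on every spatial side ahead of the 3x3 depthwise convolution.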
%4 = flow.dispatch.workgroups[%c32, %c114, %c114](%2, %3) : (tensor<1x112x112x32xf32>, tensor<1x114x114x32xf32>) -> %3 =
(%arg1: !flow.dispatch.tensor<readonly:1x112x112x32xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x114x114x32xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x114x114x32xf32> -> tensor<1x114x114x32xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 112, 112, 32] [1, 1, 1, 1] : tensor<1x112x112x32xf32> into tensor<1x114x114x32xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x114x114x32xf32> -> !flow.dispatch.tensor<readwrite:1x114x114x32xf32>
flow.return
}
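// Dispatch %5: 3x3, stride-1 depthwise convolution over the padded 114x114
// tensor, with the same fused normalization + ReLU6 in its linalg.generic.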
%5 = flow.dispatch.workgroups[%c32, %c112, %c112](%cst_183, %cst_185, %cst_193, %cst_184, %4, %cst_244) : (tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x114x114x32xf32>, tensor<3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:1x114x114x32xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x32xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c112_265 = constant 112 : index
%c32_266 = constant 32 : index
%94 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c112_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c112_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c32_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 114)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 114)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x114x114x32xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x32xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
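// Dispatch %6: 1x1 convolution projecting 32 -> 16 channels. Note that this
// generic applies only the affine normalization, with no ReLU6 clamp, which
// is consistent with the linear bottleneck of an inverted-residual block.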
%6 = flow.dispatch.workgroups[%c16, %c112, %c112](%cst_186, %cst_188, %cst_194, %cst_187, %5, %cst_189) : (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<1x112x112x32xf32>, tensor<1x1x32x16xf32>) -> tensor<1x112x112x16xf32> =
(%arg1: !flow.dispatch.tensor<readonly:16xf32>, %arg2: !flow.dispatch.tensor<readonly:16xf32>, %arg3: !flow.dispatch.tensor<readonly:16xf32>, %arg4: !flow.dispatch.tensor<readonly:16xf32>, %arg5: !flow.dispatch.tensor<readonly:1x112x112x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x16xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x16xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c112_263 = constant 112 : index
%c16_264 = constant 16 : index
%94 = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c112_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c112_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c16_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x112x112x16xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x?x?x32xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 16, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x16xf32> -> tensor<1x1x32x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 16, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x112x112x16xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
linalg.yield %130 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
}
}
}
flow.return
}
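// Dispatch %7: 1x1 expansion convolution (16 -> 96 channels) with fused
// normalization + ReLU6; the start of the next block.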
%7 = flow.dispatch.workgroups[%c96, %c112, %c112](%cst_79, %cst_81, %cst_195, %cst_80, %6, %cst_82) : (tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x112x112x16xf32>, tensor<1x1x16x96xf32>) -> tensor<1x112x112x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:1x112x112x16xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x16x96xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x96xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c112_265 = constant 112 : index
%c96_266 = constant 96 : index
%94 = linalg.init_tensor [1, 112, 112, 96] : tensor<1x112x112x96xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c112_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c112_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c96_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x112x112x96xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x16xf32> -> tensor<1x?x?x16xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 16, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x16x96xf32> -> tensor<1x1x16x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x112x112x96xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x16xf32>, tensor<1x1x16x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x96xf32>
}
}
}
flow.return
}
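// Dispatch %8: zero-fill a 1x113x113x96 buffer. The odd 113 extent supplies
// the asymmetric padding a stride-2 3x3 depthwise conv needs.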
%8 = flow.dispatch.workgroups[%c96, %c113, %c113]() : () -> tensor<1x113x113x96xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x113x113x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 113, 113, 96] : tensor<1x113x113x96xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x113x113x96xf32>, f32 -> tensor<1x113x113x96xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x113x113x96xf32> -> !flow.dispatch.tensor<writeonly:1x113x113x96xf32>
flow.return
}
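// Dispatch %9: insert at offset [0, 0, 0, 0], so the single extra row and
// column of zeros sit on the trailing spatial edges only.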
%9 = flow.dispatch.workgroups[%c96, %c113, %c113](%7, %8) : (tensor<1x112x112x96xf32>, tensor<1x113x113x96xf32>) -> %8 =
(%arg1: !flow.dispatch.tensor<readonly:1x112x112x96xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x113x113x96xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x112x112x96xf32> -> tensor<1x112x112x96xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x113x113x96xf32> -> tensor<1x113x113x96xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 112, 112, 96] [1, 1, 1, 1] : tensor<1x112x112x96xf32> into tensor<1x113x113x96xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x113x113x96xf32> -> !flow.dispatch.tensor<readwrite:1x113x113x96xf32>
flow.return
}
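// Dispatch %10: 3x3, stride-2 depthwise convolution, 113x113 -> 56x56, with
// fused normalization + ReLU6.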
%10 = flow.dispatch.workgroups[%c96, %c56, %c56](%cst_76, %cst_78, %cst_196, %cst_77, %9, %cst_245) : (tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x113x113x96xf32>, tensor<3x3x96xf32>) -> tensor<1x56x56x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:1x113x113x96xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x96xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x96xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c96_266 = constant 96 : index
%94 = linalg.init_tensor [1, 56, 56, 96] : tensor<1x56x56x96xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c56_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c56_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c96_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x56x56x96xf32> to tensor<1x?x?x?xf32>
%115 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%116 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 113)>(%101, %arg8)
%117 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%118 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 113)>(%102, %arg9)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg5, offsets = [0, %115, %117, %arg10], sizes = [1, %116, %118, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x113x113x96xf32> -> tensor<1x?x?x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%122 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %121], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x96xf32> -> tensor<3x3x?xf32>
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%125 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%126 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %123, %124, %125] [1, 1, 1, 1] : tensor<1x56x56x96xf32> to tensor<1x?x?x?xf32>
%127 = linalg.fill(%126, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%128 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%120, %122 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%127 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%129 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%128, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%132 = subf %arg11, %arg12 : f32
%133 = mulf %132, %arg13 : f32
%134 = divf %133, %arg14 : f32
%135 = addf %134, %arg15 : f32
%136 = cmpf olt, %135, %cst_263 : f32
%137 = select %136, %135, %cst_263 : f32
%138 = cmpf uno, %135, %cst_263 : f32
%139 = select %138, %cst_262, %137 : f32
%140 = cmpf ogt, %139, %cst_264 : f32
%141 = select %140, %139, %cst_264 : f32
%142 = cmpf uno, %139, %cst_264 : f32
%143 = select %142, %cst_262, %141 : f32
linalg.yield %143 : f32
} -> tensor<1x?x?x?xf32>
%130 = tensor.cast %129 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%131 = tensor.cast %130 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %131, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x96xf32>
}
}
}
flow.return
}
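// Dispatch %11: 1x1 projection convolution (96 -> 24 channels); linear again,
// with no clamp in the generic.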
%11 = flow.dispatch.workgroups[%c24, %c56, %c56](%cst_83, %cst_85, %cst_197, %cst_84, %10, %cst_86) : (tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<1x56x56x96xf32>, tensor<1x1x96x24xf32>) -> tensor<1x56x56x24xf32> =
(%arg1: !flow.dispatch.tensor<readonly:24xf32>, %arg2: !flow.dispatch.tensor<readonly:24xf32>, %arg3: !flow.dispatch.tensor<readonly:24xf32>, %arg4: !flow.dispatch.tensor<readonly:24xf32>, %arg5: !flow.dispatch.tensor<readonly:1x56x56x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x24xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x24xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c56_263 = constant 56 : index
%c24_264 = constant 24 : index
%94 = linalg.init_tensor [1, 56, 56, 24] : tensor<1x56x56x24xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c56_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c56_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c24_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x56x56x24xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x96xf32> -> tensor<1x?x?x96xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x24xf32> -> tensor<1x1x96x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x56x56x24xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
linalg.yield %130 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x24xf32>
}
}
}
flow.return
}
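// Dispatch %12: 1x1 expansion convolution (24 -> 144 channels) + ReLU6.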
%12 = flow.dispatch.workgroups[%c144, %c56, %c56](%cst_90, %cst_92, %cst_198, %cst_91, %11, %cst_93) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x56x56x24xf32>, tensor<1x1x24x144xf32>) -> tensor<1x56x56x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x56x56x24xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x24x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c144_266 = constant 144 : index
%94 = linalg.init_tensor [1, 56, 56, 144] : tensor<1x56x56x144xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c56_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c56_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c144_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x56x56x144xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 24], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x24xf32> -> tensor<1x?x?x24xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 24, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x24x144xf32> -> tensor<1x1x24x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x56x56x144xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x24xf32>, tensor<1x1x24x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x144xf32>
}
}
}
flow.return
}
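// The next two dispatches materialize a zero-padded copy of the 1x56x56x144
// activation for the 3x3 depthwise convolution that follows: zero-fill a
// 1x58x58x144 buffer, then insert the activation into it.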
%13 = flow.dispatch.workgroups[%c144, %c58, %c58]() : () -> tensor<1x58x58x144xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x58x58x144xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 58, 58, 144] : tensor<1x58x58x144xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x58x58x144xf32>, f32 -> tensor<1x58x58x144xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x58x58x144xf32> -> !flow.dispatch.tensor<writeonly:1x58x58x144xf32>
flow.return
}
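// Insert the 1x56x56x144 result at offset [0, 1, 1, 0]: one pixel of zero
// padding on every spatial edge.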
%14 = flow.dispatch.workgroups[%c144, %c58, %c58](%12, %13) : (tensor<1x56x56x144xf32>, tensor<1x58x58x144xf32>) -> %13 =
(%arg1: !flow.dispatch.tensor<readonly:1x56x56x144xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x58x58x144xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x56x56x144xf32> -> tensor<1x56x56x144xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x58x58x144xf32> -> tensor<1x58x58x144xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 56, 56, 144] [1, 1, 1, 1] : tensor<1x56x56x144xf32> into tensor<1x58x58x144xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x58x58x144xf32> -> !flow.dispatch.tensor<readwrite:1x58x58x144xf32>
flow.return
}
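// 3x3 depthwise convolution (stride 1) over the padded 58x58 input, fused
// with the batch-norm-style rescale and the NaN-aware clamp to [0, 6]
// (%cst_263 = 6.0, %cst_264 = 0.0, %cst_262 = NaN).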
%15 = flow.dispatch.workgroups[%c144, %c56, %c56](%cst_87, %cst_89, %cst_199, %cst_88, %14, %cst_246) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x58x58x144xf32>, tensor<3x3x144xf32>) -> tensor<1x56x56x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x58x58x144xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c144_266 = constant 144 : index
%94 = linalg.init_tensor [1, 56, 56, 144] : tensor<1x56x56x144xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c56_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c56_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c144_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x56x56x144xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 58)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 58)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x58x58x144xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x56x56x144xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x144xf32>
}
}
}
flow.return
}
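// 1x1 projection convolution (144 -> 24 channels) with the rescale and a
// residual add against the block input %11 (%arg12 in the region below); no
// activation clamp, consistent with the linear bottleneck of a
// MobileNetV2-style inverted residual block.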
%16 = flow.dispatch.workgroups[%c24, %c56, %c56](%11, %cst_94, %cst_96, %cst_200, %cst_95, %15, %cst_97) : (tensor<1x56x56x24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<1x56x56x144xf32>, tensor<1x1x144x24xf32>) -> tensor<1x56x56x24xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x56x56x24xf32>, %arg2: !flow.dispatch.tensor<readonly:24xf32>, %arg3: !flow.dispatch.tensor<readonly:24xf32>, %arg4: !flow.dispatch.tensor<readonly:24xf32>, %arg5: !flow.dispatch.tensor<readonly:24xf32>, %arg6: !flow.dispatch.tensor<readonly:1x56x56x144xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x144x24xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x56x56x24xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c56_263 = constant 56 : index
%c24_264 = constant 24 : index
%94 = linalg.init_tensor [1, 56, 56, 24] : tensor<1x56x56x24xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c56_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c56_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c24_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x24xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x56x56x24xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x144xf32> -> tensor<1x?x?x144xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 144, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x144x24xf32> -> tensor<1x1x144x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x56x56x24xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x144xf32>, tensor<1x1x144x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x24xf32>
}
}
}
flow.return
}
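// Next block: 1x1 expansion convolution (24 -> 144 channels) fused with the
// rescale and the ReLU6-style clamp.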
%17 = flow.dispatch.workgroups[%c144, %c56, %c56](%cst_101, %cst_103, %cst_201, %cst_102, %16, %cst_104) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x56x56x24xf32>, tensor<1x1x24x144xf32>) -> tensor<1x56x56x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x56x56x24xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x24x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c144_266 = constant 144 : index
%94 = linalg.init_tensor [1, 56, 56, 144] : tensor<1x56x56x144xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c56_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c56_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c144_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x56x56x144xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 24], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x24xf32> -> tensor<1x?x?x24xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 24, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x24x144xf32> -> tensor<1x1x24x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x56x56x144xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x24xf32>, tensor<1x1x24x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x144xf32>
}
}
}
flow.return
}
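// Zero-fill a 1x57x57x144 buffer: padding for the stride-2 3x3 depthwise
// convolution below.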
%18 = flow.dispatch.workgroups[%c144, %c57, %c57]() : () -> tensor<1x57x57x144xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x57x57x144xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 57, 57, 144] : tensor<1x57x57x144xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x57x57x144xf32>, f32 -> tensor<1x57x57x144xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x57x57x144xf32> -> !flow.dispatch.tensor<writeonly:1x57x57x144xf32>
flow.return
}
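// Insert the 56x56 activation at [0, 0, 0, 0], leaving one zero row/column
// at the bottom and right only; this asymmetric padding is consistent with
// TensorFlow-style SAME padding for a stride-2 3x3 filter.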
%19 = flow.dispatch.workgroups[%c144, %c57, %c57](%17, %18) : (tensor<1x56x56x144xf32>, tensor<1x57x57x144xf32>) -> %18 =
(%arg1: !flow.dispatch.tensor<readonly:1x56x56x144xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x57x57x144xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x56x56x144xf32> -> tensor<1x56x56x144xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x57x57x144xf32> -> tensor<1x57x57x144xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 56, 56, 144] [1, 1, 1, 1] : tensor<1x56x56x144xf32> into tensor<1x57x57x144xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x57x57x144xf32> -> !flow.dispatch.tensor<readwrite:1x57x57x144xf32>
flow.return
}
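// Stride-2 3x3 depthwise convolution (56x56 -> 28x28, 144 channels), fused
// with the rescale and the [0, 6] clamp; note the input offsets %115/%117
// are 2x the output indices.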
%20 = flow.dispatch.workgroups[%c144, %c28, %c28](%cst_98, %cst_100, %cst_202, %cst_99, %19, %cst_247) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) -> tensor<1x28x28x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x57x57x144xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c144_266 = constant 144 : index
%94 = linalg.init_tensor [1, 28, 28, 144] : tensor<1x28x28x144xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c28_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c28_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c144_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x28x28x144xf32> to tensor<1x?x?x?xf32>
%115 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%116 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 57)>(%101, %arg8)
%117 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%118 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 57)>(%102, %arg9)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg5, offsets = [0, %115, %117, %arg10], sizes = [1, %116, %118, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x57x57x144xf32> -> tensor<1x?x?x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%122 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %121], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x?xf32>
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%125 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%126 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %123, %124, %125] [1, 1, 1, 1] : tensor<1x28x28x144xf32> to tensor<1x?x?x?xf32>
%127 = linalg.fill(%126, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%128 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%120, %122 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%127 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%129 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%128, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%132 = subf %arg11, %arg12 : f32
%133 = mulf %132, %arg13 : f32
%134 = divf %133, %arg14 : f32
%135 = addf %134, %arg15 : f32
%136 = cmpf olt, %135, %cst_263 : f32
%137 = select %136, %135, %cst_263 : f32
%138 = cmpf uno, %135, %cst_263 : f32
%139 = select %138, %cst_262, %137 : f32
%140 = cmpf ogt, %139, %cst_264 : f32
%141 = select %140, %139, %cst_264 : f32
%142 = cmpf uno, %139, %cst_264 : f32
%143 = select %142, %cst_262, %141 : f32
linalg.yield %143 : f32
} -> tensor<1x?x?x?xf32>
%130 = tensor.cast %129 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%131 = tensor.cast %130 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %131, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x144xf32>
}
}
}
flow.return
}
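// 1x1 projection convolution (144 -> 32 channels) with the rescale only: no
// clamp, and no residual add (the block's input and output shapes differ,
// so there is nothing to add).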
%21 = flow.dispatch.workgroups[%c32, %c28, %c28](%cst_105, %cst_107, %cst_203, %cst_106, %20, %cst_108) : (tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x28x28x144xf32>, tensor<1x1x144x32xf32>) -> tensor<1x28x28x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x144xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x144x32xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c28_263 = constant 28 : index
%c32_264 = constant 32 : index
%94 = linalg.init_tensor [1, 28, 28, 32] : tensor<1x28x28x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c28_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c28_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c32_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x28x28x32xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x144xf32> -> tensor<1x?x?x144xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 144, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x144x32xf32> -> tensor<1x1x144x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x28x28x32xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x144xf32>, tensor<1x1x144x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
linalg.yield %130 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x32xf32>
}
}
}
flow.return
}
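// 1x1 expansion convolution (32 -> 192 channels) fused with the rescale and
// the ReLU6-style clamp.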
%22 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_112, %cst_114, %cst_204, %cst_113, %21, %cst_115) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%94 = linalg.init_tensor [1, 28, 28, 192] : tensor<1x28x28x192xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c28_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c28_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c192_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x32xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x192xf32> -> tensor<1x1x32x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
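// Zero-fill a 1x30x30x192 buffer: symmetric one-pixel padding for the next
// stride-1 3x3 depthwise convolution.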
%23 = flow.dispatch.workgroups[%c192, %c30, %c30]() : () -> tensor<1x30x30x192xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x30x30x192xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 30, 30, 192] : tensor<1x30x30x192xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x30x30x192xf32>, f32 -> tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<writeonly:1x30x30x192xf32>
flow.return
}
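// Insert the 28x28 activation at [0, 1, 1, 0] into the padded buffer.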
%24 = flow.dispatch.workgroups[%c192, %c30, %c30](%22, %23) : (tensor<1x28x28x192xf32>, tensor<1x30x30x192xf32>) -> %23 =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x30x30x192xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x28x28x192xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x30x30x192xf32> -> tensor<1x30x30x192xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<readwrite:1x30x30x192xf32>
flow.return
}
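// 3x3 depthwise convolution (stride 1) over the padded 30x30 input (192
// channels), fused with the rescale and the [0, 6] clamp.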
%25 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_109, %cst_111, %cst_205, %cst_110, %24, %cst_248) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x30x30x192xf32>, tensor<3x3x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x30x30x192xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%94 = linalg.init_tensor [1, 28, 28, 192] : tensor<1x28x28x192xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c28_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c28_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c192_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x30x30x192xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x192xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
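// 1x1 projection convolution (192 -> 32 channels) with the rescale and a
// residual add against %21, the previous bottleneck output; again no clamp
// on the linear bottleneck.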
%26 = flow.dispatch.workgroups[%c32, %c28, %c28](%21, %cst_116, %cst_118, %cst_206, %cst_117, %25, %cst_119) : (tensor<1x28x28x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x28x28x192xf32>, tensor<1x1x192x32xf32>) -> tensor<1x28x28x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x192x32xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x28x28x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c28_263 = constant 28 : index
%c32_264 = constant 32 : index
%94 = linalg.init_tensor [1, 28, 28, 32] : tensor<1x28x28x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c28_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c28_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c32_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x28x28x32xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 192], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x?x?x192xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 192, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x192x32xf32> -> tensor<1x1x192x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x28x28x32xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x192xf32>, tensor<1x1x192x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x32xf32>
}
}
}
flow.return
}
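// Next block: 1x1 expansion convolution (32 -> 192 channels) with the
// rescale and, given the NaN/6.0/0.0 constants above, the same ReLU6-style
// clamp.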
%27 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_123, %cst_125, %cst_207, %cst_124, %26, %cst_126) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%94 = linalg.init_tensor [1, 28, 28, 192] : tensor<1x28x28x192xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c28_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c28_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c192_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x32xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x192xf32> -> tensor<1x1x32x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
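// Dispatches %28 and %29 implement the zero padding for the next 3x3
// depthwise convolution: %28 fills a fresh 1x30x30x192 buffer with 0.0, and
// %29 inserts the 1x28x28x192 result at offset [0, 1, 1, 0], i.e. one pixel
// of padding on each spatial side.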
%28 = flow.dispatch.workgroups[%c192, %c30, %c30]() : () -> tensor<1x30x30x192xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x30x30x192xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 30, 30, 192] : tensor<1x30x30x192xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x30x30x192xf32>, f32 -> tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<writeonly:1x30x30x192xf32>
flow.return
}
%29 = flow.dispatch.workgroups[%c192, %c30, %c30](%27, %28) : (tensor<1x28x28x192xf32>, tensor<1x30x30x192xf32>) -> %28 =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x30x30x192xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x28x28x192xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x30x30x192xf32> -> tensor<1x30x30x192xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<readwrite:1x30x30x192xf32>
flow.return
}
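// Dispatch %30: 3x3 depthwise convolution (stride 1) over the padded
// 1x30x30x192 input, producing 1x28x28x192, again fused with the folded
// batch-norm affine transform and the NaN-aware ReLU6 clamp.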
%30 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_120, %cst_122, %cst_208, %cst_121, %29, %cst_249) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x30x30x192xf32>, tensor<3x3x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x30x30x192xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%94 = linalg.init_tensor [1, 28, 28, 192] : tensor<1x28x28x192xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c28_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c28_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c192_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x30x30x192xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x192xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
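// Dispatch %31: pointwise projection back to 32 channels (1x1x192x32 filter),
// fused with the folded batch norm and a residual add of %26 (%arg12 in the
// generic body), completing the inverted residual block. Note there is no
// ReLU6 here: the projection is linear.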
%31 = flow.dispatch.workgroups[%c32, %c28, %c28](%26, %cst_127, %cst_129, %cst_209, %cst_128, %30, %cst_130) : (tensor<1x28x28x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x28x28x192xf32>, tensor<1x1x192x32xf32>) -> tensor<1x28x28x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x192x32xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x28x28x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c28_263 = constant 28 : index
%c32_264 = constant 32 : index
%94 = linalg.init_tensor [1, 28, 28, 32] : tensor<1x28x28x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c28_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c28_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c32_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x28x28x32xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 192], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x?x?x192xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 192, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x192x32xf32> -> tensor<1x1x192x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x28x28x32xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x192xf32>, tensor<1x1x192x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x32xf32>
}
}
}
flow.return
}
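// Dispatch %32: expansion again (1x1 conv, 32 -> 192 channels) with folded
// batch norm and ReLU6, starting the next block.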
%32 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_134, %cst_136, %cst_210, %cst_135, %31, %cst_137) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%94 = linalg.init_tensor [1, 28, 28, 192] : tensor<1x28x28x192xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c28_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c28_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c192_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x32xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x192xf32> -> tensor<1x1x32x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x28x28x192xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
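// Dispatches %33 and %34 pad for the stride-2 depthwise convolution below:
// a zero-filled 1x29x29x192 buffer with the 1x28x28x192 tensor inserted at
// [0, 0, 0, 0], i.e. one pixel of padding on the bottom/right only
// (SAME padding for stride 2).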
%33 = flow.dispatch.workgroups[%c192, %c29, %c29]() : () -> tensor<1x29x29x192xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x29x29x192xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 29, 29, 192] : tensor<1x29x29x192xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x29x29x192xf32>, f32 -> tensor<1x29x29x192xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x29x29x192xf32> -> !flow.dispatch.tensor<writeonly:1x29x29x192xf32>
flow.return
}
%34 = flow.dispatch.workgroups[%c192, %c29, %c29](%32, %33) : (tensor<1x28x28x192xf32>, tensor<1x29x29x192xf32>) -> %33 =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x29x29x192xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x28x28x192xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x29x29x192xf32> -> tensor<1x29x29x192xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x29x29x192xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x29x29x192xf32> -> !flow.dispatch.tensor<readwrite:1x29x29x192xf32>
flow.return
}
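// Dispatch %35: 3x3 depthwise convolution with stride 2, downsampling
// 28x28 -> 14x14 at 192 channels, fused with folded batch norm and ReLU6.
// The affine maps around the input load (d0 * 2, min(d0 * 2 + 1, 29 - d1 * 2))
// translate each output tile into a 2x-strided window of the padded 29x29 input.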
%35 = flow.dispatch.workgroups[%c192, %c14, %c14](%cst_131, %cst_133, %cst_211, %cst_132, %34, %cst_250) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x29x29x192xf32>, tensor<3x3x192xf32>) -> tensor<1x14x14x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x29x29x192xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c192_266 = constant 192 : index
%94 = linalg.init_tensor [1, 14, 14, 192] : tensor<1x14x14x192xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c192_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x14x14x192xf32> to tensor<1x?x?x?xf32>
%115 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%116 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 29)>(%101, %arg8)
%117 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%118 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 29)>(%102, %arg9)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg5, offsets = [0, %115, %117, %arg10], sizes = [1, %116, %118, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x29x29x192xf32> -> tensor<1x?x?x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%122 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %121], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x192xf32> -> tensor<3x3x?xf32>
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%125 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%126 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %123, %124, %125] [1, 1, 1, 1] : tensor<1x14x14x192xf32> to tensor<1x?x?x?xf32>
%127 = linalg.fill(%126, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%128 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%120, %122 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%127 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%129 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%128, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%132 = subf %arg11, %arg12 : f32
%133 = mulf %132, %arg13 : f32
%134 = divf %133, %arg14 : f32
%135 = addf %134, %arg15 : f32
%136 = cmpf olt, %135, %cst_263 : f32
%137 = select %136, %135, %cst_263 : f32
%138 = cmpf uno, %135, %cst_263 : f32
%139 = select %138, %cst_262, %137 : f32
%140 = cmpf ogt, %139, %cst_264 : f32
%141 = select %140, %139, %cst_264 : f32
%142 = cmpf uno, %139, %cst_264 : f32
%143 = select %142, %cst_262, %141 : f32
linalg.yield %143 : f32
} -> tensor<1x?x?x?xf32>
%130 = tensor.cast %129 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%131 = tensor.cast %130 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %131, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x192xf32>
}
}
}
flow.return
}
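// Dispatch %36: linear bottleneck projection, 1x1 conv 192 -> 64 channels
// with folded batch norm and no activation. No residual add here, since the
// preceding stride-2 stage changed the spatial resolution.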
%36 = flow.dispatch.workgroups[%c64, %c14, %c14](%cst_138, %cst_140, %cst_212, %cst_139, %35, %cst_141) : (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x192xf32>, tensor<1x1x192x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x192xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x192x64xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%94 = linalg.init_tensor [1, 14, 14, 64] : tensor<1x14x14x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c64_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 192], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x192xf32> -> tensor<1x?x?x192xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 192, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x192x64xf32> -> tensor<1x1x192x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x192xf32>, tensor<1x1x192x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
linalg.yield %130 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
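// Dispatch %37: expansion 1x1 conv, 64 -> 384 channels at 14x14, with folded
// batch norm and ReLU6.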
%37 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_145, %cst_147, %cst_213, %cst_146, %36, %cst_148) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
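// Dispatches %38 and %39: zero padding to 1x16x16x384 (insert at
// [0, 1, 1, 0], one pixel per side) for the stride-1 3x3 depthwise
// convolution below.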
%38 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
%39 = flow.dispatch.workgroups[%c384, %c16, %c16](%37, %38) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %38 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
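// Dispatch %40: 3x3 depthwise convolution (stride 1) over the padded
// 1x16x16x384 input, producing 1x14x14x384, with folded batch norm and ReLU6.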
%40 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_142, %cst_144, %cst_214, %cst_143, %39, %cst_251) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
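// Dispatch %41: pointwise projection 384 -> 64 channels with folded batch
// norm and a residual add of %36, mirroring dispatch %31.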
%41 = flow.dispatch.workgroups[%c64, %c14, %c14](%36, %cst_149, %cst_151, %cst_215, %cst_150, %40, %cst_152) : (tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x384x64xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%94 = linalg.init_tensor [1, 14, 14, 64] : tensor<1x14x14x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c14_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c14_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c64_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 384, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x64xf32> -> tensor<1x1x384x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
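// Dispatch %42: 1x1 expansion convolution 64 -> 384 channels, fused with the same
// normalization chain followed by a ReLU6 clamp: min against 6.0 and max against 0.0,
// with cmpf uno selects substituting the quiet NaN constant 0x7FC00000 for NaN inputs.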
%42 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_156, %cst_158, %cst_216, %cst_157, %41, %cst_159) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
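// Dispatch %43: produces a zero-filled 1x16x16x384 tensor, the padded destination
// for the insert in the next dispatch.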
%43 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
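// Dispatch %44: inserts the 1x14x14x384 result of %42 at offset [0, 1, 1, 0] into the
// zero buffer from %43, i.e. a one-pixel zero border ahead of the 3x3 depthwise
// convolution below.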
%44 = flow.dispatch.workgroups[%c384, %c16, %c16](%42, %43) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %43 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
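// Dispatch %45: 3x3 depthwise convolution (linalg.depthwise_conv_2d_input_nhwc_filter_hwc)
// over the padded 1x16x16x384 input back down to 1x14x14x384, fused with normalization
// and the same NaN-guarded ReLU6 clamp.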
%45 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_153, %cst_155, %cst_217, %cst_154, %44, %cst_252) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
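// Dispatch %46: 1x1 projection convolution 384 -> 64, fused with normalization and a
// residual add against %41 (no activation), completing the expand / depthwise / project
// plus skip pattern of an inverted-residual (MobileNetV2-style) block.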
%46 = flow.dispatch.workgroups[%c64, %c14, %c14](%41, %cst_160, %cst_162, %cst_218, %cst_161, %45, %cst_163) : (tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x384x64xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%94 = linalg.init_tensor [1, 14, 14, 64] : tensor<1x14x14x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c14_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c14_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c64_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 384, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x64xf32> -> tensor<1x1x384x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
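// Dispatch %47: the next block's 1x1 expansion 64 -> 384 with normalization and ReLU6;
// same structure as %42 with different constant operands.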
%47 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_167, %cst_169, %cst_219, %cst_168, %46, %cst_170) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
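// Dispatch %48: zero-fills the 1x16x16x384 padding buffer, as in %43.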
%48 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
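// Dispatch %49: pads via subtensor_insert at [0, 1, 1, 0], as in %44.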
%49 = flow.dispatch.workgroups[%c384, %c16, %c16](%47, %48) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %48 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
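// Dispatch %50: 3x3 depthwise convolution + normalization + ReLU6 over the padded
// input, as in %45.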
%50 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_164, %cst_166, %cst_220, %cst_165, %49, %cst_253) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
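// Dispatch %51: 1x1 projection 384 -> 64 with normalization and a residual add
// against %46, repeating the pattern of %46.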
%51 = flow.dispatch.workgroups[%c64, %c14, %c14](%46, %cst_171, %cst_173, %cst_221, %cst_172, %50, %cst_174) : (tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x384x64xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%94 = linalg.init_tensor [1, 14, 14, 64] : tensor<1x14x14x64xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c14_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c14_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c64_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 384, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x64xf32> -> tensor<1x1x384x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x14x14x64xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
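// Dispatch %52: 1x1 expansion 64 -> 384 with normalization and ReLU6, the third
// repetition of the expansion step.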
%52 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_2, %cst_4, %cst_222, %cst_3, %51, %cst_5) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
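// Dispatch %53: zero-fills the 1x16x16x384 padding buffer, as in %43 and %48.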
%53 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
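// Dispatch %54: pads via subtensor_insert at [0, 1, 1, 0], as in %44 and %49.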
%54 = flow.dispatch.workgroups[%c384, %c16, %c16](%52, %53) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %53 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
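// Dispatch %55: 3x3 depthwise convolution + normalization + ReLU6 over the padded
// input, as in %45 and %50.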
%55 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst, %cst_1, %cst_223, %cst_0, %54, %cst_254) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%94 = linalg.init_tensor [1, 14, 14, 384] : tensor<1x14x14x384xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c384_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x14x14x384xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
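// The scalar body below appears to fuse a batch-norm-style affine
// ((x - mean) * scale / denom + offset) with a clamp to [0, 6] (ReLU6);
// each cmpf uno / select pair propagates NaN through the min/max via
// %cst_262 (0x7FC00000).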
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
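// %56: 1x1 pointwise convolution projecting 384 -> 96 channels with a fused
// batch-norm-style affine and no activation clamp, consistent with the
// linear bottleneck projection of an inverted-residual block.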
%56 = flow.dispatch.workgroups[%c96, %c14, %c14](%cst_6, %cst_8, %cst_224, %cst_7, %55, %cst_9) : (tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x96xf32>) -> tensor<1x14x14x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x384x96xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c96_264 = constant 96 : index
%94 = linalg.init_tensor [1, 14, 14, 96] : tensor<1x14x14x96xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c96_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x96xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 384, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x96xf32> -> tensor<1x1x384x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x96xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
linalg.yield %130 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x96xf32>
}
}
}
flow.return
}
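// %57: 1x1 expansion convolution 96 -> 576 channels, followed by the fused
// affine and a clamp to [0, 6] (ReLU6 with NaN propagation), matching the
// expansion stage of an inverted-residual block.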
%57 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_13, %cst_15, %cst_225, %cst_14, %56, %cst_16) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%94 = linalg.init_tensor [1, 14, 14, 576] : tensor<1x14x14x576xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c576_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x96xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x576xf32> -> tensor<1x1x96x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
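// %58: materializes a zero-filled 1x16x16x576 buffer to serve as the padded
// destination for the following insert.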
%58 = flow.dispatch.workgroups[%c576, %c16, %c16]() : () -> tensor<1x16x16x576xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x576xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 576] : tensor<1x16x16x576xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x576xf32>, f32 -> tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x576xf32>
flow.return
}
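// %59: inserts the 14x14 feature map at spatial offset [1, 1], i.e. a
// symmetric zero pad of 1 on each side ahead of the 3x3 depthwise conv.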
%59 = flow.dispatch.workgroups[%c576, %c16, %c16](%57, %58) : (tensor<1x14x14x576xf32>, tensor<1x16x16x576xf32>) -> %58 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x576xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x14x14x576xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x576xf32> -> tensor<1x16x16x576xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x576xf32>
flow.return
}
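// %60: 3x3 depthwise convolution (stride 1) over the padded 16x16 input,
// followed by the fused affine + ReLU6 clamp, yielding 1x14x14x576.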
%60 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_10, %cst_12, %cst_226, %cst_11, %59, %cst_255) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x16x16x576xf32>, tensor<3x3x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x576xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%94 = linalg.init_tensor [1, 14, 14, 576] : tensor<1x14x14x576xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c576_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x576xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x576xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
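// %61: 1x1 projection convolution 576 -> 96 channels with fused affine; the
// extra addf of %arg12 folds in the residual (skip) input %56, which is why
// this dispatch also reads the 1x14x14x96 tensor.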
%61 = flow.dispatch.workgroups[%c96, %c14, %c14](%56, %cst_17, %cst_19, %cst_227, %cst_18, %60, %cst_20) : (tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x14x14x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x14x14x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x576x96xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c96_264 = constant 96 : index
%94 = linalg.init_tensor [1, 14, 14, 96] : tensor<1x14x14x96xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c14_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c14_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c96_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x14x14x96xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x?x?x576xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 576, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x576x96xf32> -> tensor<1x1x576x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x14x14x96xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x576xf32>, tensor<1x1x576x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x96xf32>
}
}
}
flow.return
}
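// %62: the next block's 1x1 expansion convolution 96 -> 576 with fused
// affine + ReLU6, same pattern as %57.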
%62 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_24, %cst_26, %cst_228, %cst_25, %61, %cst_27) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%94 = linalg.init_tensor [1, 14, 14, 576] : tensor<1x14x14x576xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c576_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x96xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x576xf32> -> tensor<1x1x96x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
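// %63: zero-filled 1x16x16x576 padding buffer, same pattern as %58.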
%63 = flow.dispatch.workgroups[%c576, %c16, %c16]() : () -> tensor<1x16x16x576xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x576xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 576] : tensor<1x16x16x576xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x576xf32>, f32 -> tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x576xf32>
flow.return
}
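// %64: symmetric spatial pad of 1 via subtensor_insert, same as %59.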
%64 = flow.dispatch.workgroups[%c576, %c16, %c16](%62, %63) : (tensor<1x14x14x576xf32>, tensor<1x16x16x576xf32>) -> %63 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x576xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x14x14x576xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x576xf32> -> tensor<1x16x16x576xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x576xf32>
flow.return
}
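// %65: 3x3 depthwise convolution (stride 1) + fused affine + ReLU6, same
// pattern as %60.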
%65 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_21, %cst_23, %cst_229, %cst_22, %64, %cst_256) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x16x16x576xf32>, tensor<3x3x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x576xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%94 = linalg.init_tensor [1, 14, 14, 576] : tensor<1x14x14x576xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c576_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x576xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x576xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
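// %66: 1x1 projection 576 -> 96 with fused affine and a residual add of
// %61, mirroring the %61 dispatch.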
%66 = flow.dispatch.workgroups[%c96, %c14, %c14](%61, %cst_28, %cst_30, %cst_230, %cst_29, %65, %cst_31) : (tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x14x14x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x14x14x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x576x96xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c96_264 = constant 96 : index
%94 = linalg.init_tensor [1, 14, 14, 96] : tensor<1x14x14x96xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c14_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c14_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c96_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x14x14x96xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x?x?x576xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 576, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x576x96xf32> -> tensor<1x1x576x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x14x14x96xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x576xf32>, tensor<1x1x576x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x96xf32>
}
}
}
flow.return
}
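// %67: 1x1 expansion convolution 96 -> 576 with fused affine + ReLU6.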
%67 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_35, %cst_37, %cst_231, %cst_36, %66, %cst_38) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%94 = linalg.init_tensor [1, 14, 14, 576] : tensor<1x14x14x576xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c14_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c14_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c576_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x96xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x576xf32> -> tensor<1x1x96x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x14x14x576xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
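// %68: zero-filled 1x15x15x576 buffer; the odd 15x15 extent suggests
// asymmetric padding ahead of a stride-2 depthwise convolution.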
%68 = flow.dispatch.workgroups[%c576, %c15, %c15]() : () -> tensor<1x15x15x576xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x15x15x576xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 15, 15, 576] : tensor<1x15x15x576xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x15x15x576xf32>, f32 -> tensor<1x15x15x576xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x15x15x576xf32> -> !flow.dispatch.tensor<writeonly:1x15x15x576xf32>
flow.return
}
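// %69: inserts the 14x14 map at offset [0, 0, 0, 0], i.e. padding only the
// bottom/right edges, as one would expect from TF-style SAME padding for a
// stride-2 window.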
%69 = flow.dispatch.workgroups[%c576, %c15, %c15](%67, %68) : (tensor<1x14x14x576xf32>, tensor<1x15x15x576xf32>) -> %68 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x15x15x576xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x14x14x576xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x15x15x576xf32> -> tensor<1x15x15x576xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x15x15x576xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x15x15x576xf32> -> !flow.dispatch.tensor<readwrite:1x15x15x576xf32>
flow.return
}
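// %70: 3x3 depthwise convolution with stride 2 (note the d0 * 2 affine maps
// applied to the input offsets below), reducing 15x15 -> 7x7, again followed
// by the fused affine + ReLU6 clamp.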
%70 = flow.dispatch.workgroups[%c576, %c7, %c7](%cst_32, %cst_34, %cst_232, %cst_33, %69, %cst_257) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x15x15x576xf32>, tensor<3x3x576xf32>) -> tensor<1x7x7x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x15x15x576xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c576_266 = constant 576 : index
%94 = linalg.init_tensor [1, 7, 7, 576] : tensor<1x7x7x576xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c576_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x7x7x576xf32> to tensor<1x?x?x?xf32>
%115 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%116 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 15)>(%101, %arg8)
%117 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%118 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 15)>(%102, %arg9)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg5, offsets = [0, %115, %117, %arg10], sizes = [1, %116, %118, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x15x15x576xf32> -> tensor<1x?x?x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%122 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %121], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x576xf32> -> tensor<3x3x?xf32>
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%125 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%126 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %123, %124, %125] [1, 1, 1, 1] : tensor<1x7x7x576xf32> to tensor<1x?x?x?xf32>
%127 = linalg.fill(%126, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%128 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%120, %122 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%127 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%129 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%128, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%132 = subf %arg11, %arg12 : f32
%133 = mulf %132, %arg13 : f32
%134 = divf %133, %arg14 : f32
%135 = addf %134, %arg15 : f32
%136 = cmpf olt, %135, %cst_263 : f32
%137 = select %136, %135, %cst_263 : f32
%138 = cmpf uno, %135, %cst_263 : f32
%139 = select %138, %cst_262, %137 : f32
%140 = cmpf ogt, %139, %cst_264 : f32
%141 = select %140, %139, %cst_264 : f32
%142 = cmpf uno, %139, %cst_264 : f32
%143 = select %142, %cst_262, %141 : f32
linalg.yield %143 : f32
} -> tensor<1x?x?x?xf32>
%130 = tensor.cast %129 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%131 = tensor.cast %130 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %131, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x576xf32>
}
}
}
flow.return
}
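// %71: 1x1 projection convolution (576 -> 160 channels) fused with the same
// affine transform, but with no clamp after the final addf: an
// activation-free projection, consistent with the linear bottleneck of a
// MobileNetV2-style inverted-residual block.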
%71 = flow.dispatch.workgroups[%c160, %c7, %c7](%cst_39, %cst_41, %cst_233, %cst_40, %70, %cst_42) : (tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<1x7x7x576xf32>, tensor<1x1x576x160xf32>) -> tensor<1x7x7x160xf32> =
(%arg1: !flow.dispatch.tensor<readonly:160xf32>, %arg2: !flow.dispatch.tensor<readonly:160xf32>, %arg3: !flow.dispatch.tensor<readonly:160xf32>, %arg4: !flow.dispatch.tensor<readonly:160xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x576xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x576x160xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x160xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c160_264 = constant 160 : index
%94 = linalg.init_tensor [1, 7, 7, 160] : tensor<1x7x7x160xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c160_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x7x7x160xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x576xf32> -> tensor<1x?x?x576xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 576, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x576x160xf32> -> tensor<1x1x576x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x7x7x160xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x576xf32>, tensor<1x1x576x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
linalg.yield %130 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x160xf32>
}
}
}
flow.return
}
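// %72: 1x1 expansion convolution (160 -> 960 channels) with fused affine
// transform and NaN-propagating ReLU6, same epilogue structure as %70.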
%72 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_46, %cst_48, %cst_234, %cst_47, %71, %cst_49) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x160x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%94 = linalg.init_tensor [1, 7, 7, 960] : tensor<1x7x7x960xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c960_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 160], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x160xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 160, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x160x960xf32> -> tensor<1x1x160x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x160xf32>, tensor<1x1x160x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
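// %73 zero-fills a 1x9x9x960 buffer for the next depthwise stage; that conv
// is stride 1, so it needs one pixel of padding on every side.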
%73 = flow.dispatch.workgroups[%c960, %c9, %c9]() : () -> tensor<1x9x9x960xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x9x9x960xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 9, 9, 960] : tensor<1x9x9x960xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<writeonly:1x9x9x960xf32>
flow.return
}
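// %74 inserts the 1x7x7x960 map at offset [0, 1, 1, 0], centering it in the
// 9x9 buffer (symmetric padding, in contrast to the stride-2 case in %69).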
%74 = flow.dispatch.workgroups[%c960, %c9, %c9](%72, %73) : (tensor<1x7x7x960xf32>, tensor<1x9x9x960xf32>) -> %73 =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x9x9x960xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x7x7x960xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x9x9x960xf32> -> tensor<1x9x9x960xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<readwrite:1x9x9x960xf32>
flow.return
}
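// %75: 3x3 depthwise convolution, stride 1 this time, with the fused affine
// transform and NaN-propagating ReLU6 clamp.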
%75 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_43, %cst_45, %cst_235, %cst_44, %74, %cst_258) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x9x9x960xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%94 = linalg.init_tensor [1, 7, 7, 960] : tensor<1x7x7x960xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c960_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x9x9x960xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x960xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
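// %76: 1x1 projection (960 -> 160) whose linalg.generic takes the earlier
// block output %71 as an additional input and ends with addf %arg12, %134:
// the residual (skip-connection) add is fused into the same dispatch.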
%76 = flow.dispatch.workgroups[%c160, %c7, %c7](%71, %cst_50, %cst_52, %cst_236, %cst_51, %75, %cst_53) : (tensor<1x7x7x160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<1x7x7x960xf32>, tensor<1x1x960x160xf32>) -> tensor<1x7x7x160xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg2: !flow.dispatch.tensor<readonly:160xf32>, %arg3: !flow.dispatch.tensor<readonly:160xf32>, %arg4: !flow.dispatch.tensor<readonly:160xf32>, %arg5: !flow.dispatch.tensor<readonly:160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x960x160xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x7x7x160xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c160_264 = constant 160 : index
%94 = linalg.init_tensor [1, 7, 7, 160] : tensor<1x7x7x160xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c7_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c7_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c160_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x7x7x160xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 960], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x?x?x960xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 960, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x960x160xf32> -> tensor<1x1x960x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x7x7x160xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x960xf32>, tensor<1x1x960x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x160xf32>
}
}
}
flow.return
}
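// %77: 1x1 expansion (160 -> 960) with fused affine transform and ReLU6;
// this begins the next inverted-residual block.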
%77 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_57, %cst_59, %cst_237, %cst_58, %76, %cst_60) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x160x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%94 = linalg.init_tensor [1, 7, 7, 960] : tensor<1x7x7x960xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c960_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 160], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x160xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 160, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x160x960xf32> -> tensor<1x1x160x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x160xf32>, tensor<1x1x160x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
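// %78 and %79 repeat the zero-fill / subtensor_insert padding idiom of
// %73/%74 for this block's depthwise stage.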
%78 = flow.dispatch.workgroups[%c960, %c9, %c9]() : () -> tensor<1x9x9x960xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x9x9x960xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 9, 9, 960] : tensor<1x9x9x960xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<writeonly:1x9x9x960xf32>
flow.return
}
%79 = flow.dispatch.workgroups[%c960, %c9, %c9](%77, %78) : (tensor<1x7x7x960xf32>, tensor<1x9x9x960xf32>) -> %78 =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x9x9x960xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x7x7x960xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x9x9x960xf32> -> tensor<1x9x9x960xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<readwrite:1x9x9x960xf32>
flow.return
}
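// %80: stride-1 3x3 depthwise convolution over the padded input from %79,
// with the usual fused affine transform and ReLU6 clamp.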
%80 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_54, %cst_56, %cst_238, %cst_55, %79, %cst_259) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x9x9x960xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%94 = linalg.init_tensor [1, 7, 7, 960] : tensor<1x7x7x960xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c960_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x9x9x960xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x960xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
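// %81: fused 1x1 projection (960 -> 160) plus residual add with the previous
// block output %76, mirroring the structure of %76.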
%81 = flow.dispatch.workgroups[%c160, %c7, %c7](%76, %cst_61, %cst_63, %cst_239, %cst_62, %80, %cst_64) : (tensor<1x7x7x160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<1x7x7x960xf32>, tensor<1x1x960x160xf32>) -> tensor<1x7x7x160xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg2: !flow.dispatch.tensor<readonly:160xf32>, %arg3: !flow.dispatch.tensor<readonly:160xf32>, %arg4: !flow.dispatch.tensor<readonly:160xf32>, %arg5: !flow.dispatch.tensor<readonly:160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x960x160xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x7x7x160xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c160_264 = constant 160 : index
%94 = linalg.init_tensor [1, 7, 7, 160] : tensor<1x7x7x160xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %95 to %c7_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %97 to %c7_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %99 to %c160_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %101, %102, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%112 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%111], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%115 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%116 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %113, %114, %115] [1, 1, 1, 1] : tensor<1x7x7x160xf32> to tensor<1x?x?x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %117, %118, 960], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x?x?x960xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 960, %120], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x960x160xf32> -> tensor<1x1x960x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%125 = subtensor %94[0, %arg9, %arg10, %arg11] [1, %122, %123, %124] [1, 1, 1, 1] : tensor<1x7x7x160xf32> to tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x960xf32>, tensor<1x1x960x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %127, %106, %108, %110, %112 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%116 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%131 = subf %arg13, %arg14 : f32
%132 = mulf %131, %arg15 : f32
%133 = divf %132, %arg16 : f32
%134 = addf %133, %arg17 : f32
%135 = addf %arg12, %134 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
%129 = tensor.cast %128 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%130 = tensor.cast %129 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %130, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %113, %114, %115], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x160xf32>
}
}
}
flow.return
}
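// %82: 1x1 expansion (160 -> 960) opening the following inverted-residual
// block, identical in structure to %77.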
%82 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_68, %cst_70, %cst_240, %cst_69, %81, %cst_71) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x160x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%94 = linalg.init_tensor [1, 7, 7, 960] : tensor<1x7x7x960xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c960_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 160], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x160xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 160, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x160x960xf32> -> tensor<1x1x160x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x160xf32>, tensor<1x1x160x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
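// Per-channel affine transform (x - a) * b / c + d -- the parameter roles are
// consistent with a folded batch norm -- followed by min(x, 6) and max(x, 0).
// Each `cmpf uno` + select pair substitutes quiet NaN when the operand is NaN,
// so the whole tail is ReLU6 with NaN propagation.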
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
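// The next dispatch zero-fills a 1x9x9x960 buffer that serves as the padding
// target for the 3x3 depthwise convolution two dispatches below.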
%83 = flow.dispatch.workgroups[%c960, %c9, %c9]() : () -> tensor<1x9x9x960xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x9x9x960xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 9, 9, 960] : tensor<1x9x9x960xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<writeonly:1x9x9x960xf32>
flow.return
}
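// Padding step: insert the 1x7x7x960 feature map at offset [0, 1, 1, 0],
// i.e. one zero pixel of padding on each spatial border.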
%84 = flow.dispatch.workgroups[%c960, %c9, %c9](%82, %83) : (tensor<1x7x7x960xf32>, tensor<1x9x9x960xf32>) -> %83 =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x9x9x960xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x7x7x960xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x9x9x960xf32> -> tensor<1x9x9x960xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<readwrite:1x9x9x960xf32>
flow.return
}
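// 3x3 depthwise convolution (stride 1) over the padded 9x9 input, producing a
// 7x7 map, fused with the same per-channel affine transform and
// NaN-propagating ReLU6 clamp as the previous dispatch.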
%85 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_65, %cst_67, %cst_241, %cst_66, %84, %cst_260) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x9x9x960xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%94 = linalg.init_tensor [1, 7, 7, 960] : tensor<1x7x7x960xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_265 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_265 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c960_266 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%110 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%109], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%114 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %111, %112, %113] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg8, %101)
%116 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg9, %102)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %115, %116, %117], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x9x9x960xf32> -> tensor<1x?x?x?xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %119], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x960xf32> -> tensor<3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%124 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %121, %122, %123] [1, 1, 1, 1] : tensor<1x7x7x960xf32> to tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %104, %106, %108, %110 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%114 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%130 = subf %arg11, %arg12 : f32
%131 = mulf %130, %arg13 : f32
%132 = divf %131, %arg14 : f32
%133 = addf %132, %arg15 : f32
%134 = cmpf olt, %133, %cst_263 : f32
%135 = select %134, %133, %cst_263 : f32
%136 = cmpf uno, %133, %cst_263 : f32
%137 = select %136, %cst_262, %135 : f32
%138 = cmpf ogt, %137, %cst_264 : f32
%139 = select %138, %137, %cst_264 : f32
%140 = cmpf uno, %137, %cst_264 : f32
%141 = select %140, %cst_262, %139 : f32
linalg.yield %141 : f32
} -> tensor<1x?x?x?xf32>
%128 = tensor.cast %127 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%129 = tensor.cast %128 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %129, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %111, %112, %113], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
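// 1x1 pointwise projection 960 -> 320. Note the fused payload applies only the
// affine transform with no clamp -- a linear bottleneck, consistent with a
// MobileNetV2-style inverted-residual block (expand 160 -> 960, depthwise 3x3,
// project -> 320).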
%86 = flow.dispatch.workgroups[%c320, %c7, %c7](%cst_72, %cst_74, %cst_242, %cst_73, %85, %cst_75) : (tensor<320xf32>, tensor<320xf32>, tensor<320xf32>, tensor<320xf32>, tensor<1x7x7x960xf32>, tensor<1x1x960x320xf32>) -> tensor<1x7x7x320xf32> =
(%arg1: !flow.dispatch.tensor<readonly:320xf32>, %arg2: !flow.dispatch.tensor<readonly:320xf32>, %arg3: !flow.dispatch.tensor<readonly:320xf32>, %arg4: !flow.dispatch.tensor<readonly:320xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x960x320xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x320xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c320_264 = constant 320 : index
%94 = linalg.init_tensor [1, 7, 7, 320] : tensor<1x7x7x320xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %95 to %c7_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %97 to %c7_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %99 to %c320_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%112 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x7x7x320xf32> to tensor<1x?x?x?xf32>
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%114 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%115 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %113, %114, 960], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x?x?x960xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 320, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 960, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x960x320xf32> -> tensor<1x1x960x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 320, d1)>(%arg10, %workgroup_size_0)
%121 = subtensor %94[0, %arg8, %arg9, %arg10] [1, %118, %119, %120] [1, 1, 1, 1] : tensor<1x7x7x320xf32> to tensor<1x?x?x?xf32>
%122 = linalg.fill(%121, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%123 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%115, %117 : tensor<1x?x?x960xf32>, tensor<1x1x960x?xf32>) outs(%122 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%124 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%123, %102, %104, %106, %108 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
linalg.yield %130 : f32
} -> tensor<1x?x?x?xf32>
%125 = tensor.cast %124 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%126 = tensor.cast %125 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %109, %110, %111], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x320xf32>
}
}
}
flow.return
}
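// 1x1 convolution 320 -> 1280 with no fused elementwise payload; the affine
// transform and ReLU6 for this layer are folded into the next dispatch.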
%87 = flow.dispatch.workgroups[%c1280, %c7, %c7](%86, %cst_182) : (tensor<1x7x7x320xf32>, tensor<1x1x320x1280xf32>) -> tensor<1x7x7x1280xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x320xf32>, %arg2: !flow.dispatch.tensor<readonly:1x1x320x1280xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x7x7x1280xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c1280_264 = constant 1280 : index
%94 = linalg.init_tensor [1, 7, 7, 1280] : tensor<1x7x7x1280xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg4 = %95 to %c7_263 step %96 {
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg5 = %97 to %c7_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg6 = %99 to %c1280_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg4, %workgroup_size_2)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg5, %workgroup_size_1)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4, %arg5, 0], sizes = [1, %101, %102, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x320xf32> -> tensor<1x?x?x320xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg6, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, %arg6], sizes = [1, 1, 320, %104], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x320x1280xf32> -> tensor<1x1x320x?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg4, %workgroup_size_2)
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg5, %workgroup_size_1)
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg6, %workgroup_size_0)
%109 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg4, %workgroup_size_2)
%110 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg5, %workgroup_size_1)
%111 = affine.min affine_map<(d0, d1) -> (-d0 + 1280, d1)>(%arg6, %workgroup_size_0)
%112 = subtensor %94[0, %arg4, %arg5, %arg6] [1, %109, %110, %111] [1, 1, 1, 1] : tensor<1x7x7x1280xf32> to tensor<1x?x?x?xf32>
%113 = linalg.fill(%112, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%114 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%103, %105 : tensor<1x?x?x320xf32>, tensor<1x1x320x?xf32>) outs(%113 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%115 = tensor.cast %114 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%116 = tensor.cast %115 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %116, %arg3, offsets = [0, %arg4, %arg5, %arg6], sizes = [1, %106, %107, %108], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x1280xf32>
}
}
}
flow.return
}
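// Fused epilogue plus the first half of global average pooling: apply the
// per-channel affine transform and ReLU6 to the 1x7x7x1280 map, then sum-reduce
// the two spatial dimensions (iterator_types: one parallel, two reduction)
// into a 1280-element vector.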
%88 = flow.dispatch.workgroups[%c1280, %c1, %c1](%87, %cst_179, %cst_181, %cst_243, %cst_180) : (tensor<1x7x7x1280xf32>, tensor<1280xf32>, tensor<1280xf32>, tensor<1280xf32>, tensor<1280xf32>) -> tensor<1280xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x1280xf32>, %arg2: !flow.dispatch.tensor<readonly:1280xf32>, %arg3: !flow.dispatch.tensor<readonly:1280xf32>, %arg4: !flow.dispatch.tensor<readonly:1280xf32>, %arg5: !flow.dispatch.tensor<readonly:1280xf32>, %arg6: !flow.dispatch.tensor<writeonly:1280xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c1280_265 = constant 1280 : index
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x1280xf32> -> tensor<1x7x7x1280xf32>
%95 = linalg.tensor_reshape %94 [affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d2)>, affine_map<(d0, d1, d2, d3) -> (d3)>] : tensor<1x7x7x1280xf32> into tensor<7x7x1280xf32>
%96 = linalg.init_tensor [1280] : tensor<1280xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %97 to %c1280_265 step %98 {
%99 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%100 = subtensor %95[0, 0, %arg7] [7, 7, %99] [1, 1, 1] : tensor<7x7x1280xf32> to tensor<7x7x?xf32>
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%102 = flow.dispatch.tensor.load %arg2, offsets = [%arg7], sizes = [%101], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg3, offsets = [%arg7], sizes = [%103], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg4, offsets = [%arg7], sizes = [%105], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%108 = flow.dispatch.tensor.load %arg5, offsets = [%arg7], sizes = [%107], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%110 = affine.min affine_map<(d0, d1) -> (-d0 + 1280, d1)>(%arg7, %workgroup_size_0)
%111 = subtensor %96[%arg7] [%110] [1] : tensor<1280xf32> to tensor<?xf32>
%112 = linalg.fill(%111, %cst_264) : tensor<?xf32>, f32 -> tensor<?xf32>
%113 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%100, %102, %104, %106, %108 : tensor<7x7x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%112 : tensor<?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // no predecessors
%114 = subf %arg8, %arg9 : f32
%115 = mulf %114, %arg10 : f32
%116 = divf %115, %arg11 : f32
%117 = addf %116, %arg12 : f32
%118 = cmpf olt, %117, %cst_263 : f32
%119 = select %118, %117, %cst_263 : f32
%120 = cmpf uno, %117, %cst_263 : f32
%121 = select %120, %cst_262, %119 : f32
%122 = cmpf ogt, %121, %cst_264 : f32
%123 = select %122, %121, %cst_264 : f32
%124 = cmpf uno, %121, %cst_264 : f32
%125 = select %124, %cst_262, %123 : f32
%126 = addf %125, %arg13 : f32
linalg.yield %126 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %113, %arg6, offsets = [%arg7], sizes = [%109], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:1280xf32>
}
flow.return
}
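// Second half of global average pooling: divide the spatial sums by
// 4.9e+01 = 49 = 7 * 7 to turn them into means.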
%89 = flow.dispatch.workgroups[%c1280, %c1, %c1](%88) : (tensor<1280xf32>) -> tensor<1280xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1280xf32>, %arg2: !flow.dispatch.tensor<writeonly:1280xf32>) {
%cst_262 = constant 4.900000e+01 : f32
%c1280_263 = constant 1280 : index
%94 = linalg.init_tensor [1280] : tensor<1280xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg3 = %95 to %c1280_263 step %96 {
%97 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg3, %workgroup_size_0)
%98 = flow.dispatch.tensor.load %arg1, offsets = [%arg3], sizes = [%97], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%99 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg3, %workgroup_size_0)
%100 = subtensor %94[%arg3] [%99] [1] : tensor<1280xf32> to tensor<?xf32>
%101 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%98 : tensor<?xf32>) outs(%100 : tensor<?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg4: f32, %arg5: f32): // no predecessors
%102 = divf %arg4, %cst_262 : f32
linalg.yield %102 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %101, %arg2, offsets = [%arg3], sizes = [%99], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:1280xf32>
}
flow.return
}
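// Classifier: reshape the pooled 1280-vector to 1x1280 and matmul it against
// the 1280x1000 weight matrix, producing the 1x1000 logits.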
%90 = flow.dispatch.workgroups[%c1000, %c1, %c1](%89, %cst_191) : (tensor<1280xf32>, tensor<1280x1000xf32>) -> tensor<1x1000xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1280xf32>, %arg2: !flow.dispatch.tensor<readonly:1280x1000xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x1000xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c1_263 = constant 1 : index
%c1000_264 = constant 1000 : index
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<1280xf32>
%95 = linalg.tensor_reshape %94 [affine_map<(d0, d1) -> (d0, d1)>] : tensor<1280xf32> into tensor<1x1280xf32>
%96 = linalg.init_tensor [1, 1000] : tensor<1x1000xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %97 to %c1_263 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %99 to %c1000_264 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1)>(%arg4, %workgroup_size_1)
%102 = subtensor %95[%arg4, 0] [%101, 1280] [1, 1] : tensor<1x1280xf32> to tensor<?x1280xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1000)>(%arg5, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [0, %arg5], sizes = [1280, %103], strides = [1, 1] : !flow.dispatch.tensor<readonly:1280x1000xf32> -> tensor<1280x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1)>(%arg4, %workgroup_size_1)
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1000)>(%arg5, %workgroup_size_0)
%107 = affine.min affine_map<(d0, d1) -> (-d0 + 1, d1)>(%arg4, %workgroup_size_1)
%108 = affine.min affine_map<(d0, d1) -> (-d0 + 1000, d1)>(%arg5, %workgroup_size_0)
%109 = subtensor %96[%arg4, %arg5] [%107, %108] [1, 1] : tensor<1x1000xf32> to tensor<?x?xf32>
%110 = linalg.fill(%109, %cst_262) : tensor<?x?xf32>, f32 -> tensor<?x?xf32>
%111 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%102, %104 : tensor<?x1280xf32>, tensor<1280x?xf32>) outs(%110 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %111, %arg3, offsets = [%arg4, %arg5], sizes = [%105, %106], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1x1000xf32>
}
}
flow.return
}
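// Softmax, step 1: max-reduce over (logit + bias), starting from
// 0xFF800000 = -inf, with NaN guarded via `cmpf uno` -- the standard shift for
// a numerically stable softmax.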
%91 = flow.dispatch.workgroups[%c1, %c1, %c1](%90, %cst_190) : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<f32> =
(%arg1: !flow.dispatch.tensor<readonly:1x1000xf32>, %arg2: !flow.dispatch.tensor<readonly:1000xf32>, %arg3: !flow.dispatch.tensor<writeonly:f32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 0xFF800000 : f32
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x1000xf32> -> tensor<1x1000xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1000xf32> -> tensor<1000xf32>
%96 = linalg.tensor_reshape %94 [affine_map<(d0, d1) -> (d0, d1)>] : tensor<1x1000xf32> into tensor<1000xf32>
%97 = linalg.init_tensor [] : tensor<f32>
%98 = linalg.fill(%97, %cst_263) : tensor<f32>, f32 -> tensor<f32>
%99 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%96, %95 : tensor<1000xf32>, tensor<1000xf32>) outs(%98 : tensor<f32>) attrs = {__root_op__ = 55 : i64} {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%100 = addf %arg4, %arg5 : f32
%101 = cmpf ogt, %100, %arg6 : f32
%102 = select %101, %100, %arg6 : f32
%103 = cmpf uno, %100, %arg6 : f32
%104 = select %103, %cst_262, %102 : f32
linalg.yield %104 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %99, %arg3, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
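// Softmax, step 2: accumulate the sum of exp(logit + bias - max), i.e. the
// softmax denominator.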
%92 = flow.dispatch.workgroups[%c1, %c1, %c1](%90, %cst_190, %91) : (tensor<1x1000xf32>, tensor<1000xf32>, tensor<f32>) -> tensor<f32> =
(%arg1: !flow.dispatch.tensor<readonly:1x1000xf32>, %arg2: !flow.dispatch.tensor<readonly:1000xf32>, %arg3: !flow.dispatch.tensor<readonly:f32>, %arg4: !flow.dispatch.tensor<writeonly:f32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x1000xf32> -> tensor<1x1000xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1000xf32> -> tensor<1000xf32>
%96 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%97 = linalg.tensor_reshape %94 [affine_map<(d0, d1) -> (d0, d1)>] : tensor<1x1000xf32> into tensor<1000xf32>
%98 = linalg.init_tensor [] : tensor<f32>
%99 = linalg.fill(%98, %cst_262) : tensor<f32>, f32 -> tensor<f32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%97, %95, %96 : tensor<1000xf32>, tensor<1000xf32>, tensor<f32>) outs(%99 : tensor<f32>) attrs = {__root_op__ = 54 : i64} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32, %arg8: f32): // no predecessors
%101 = addf %arg5, %arg6 : f32
%102 = subf %101, %arg7 : f32
%103 = math.exp %102 : f32
%104 = addf %103, %arg8 : f32
linalg.yield %104 : f32
} -> tensor<f32>
flow.dispatch.tensor.store %100, %arg4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:f32>
flow.return
}
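// Softmax, step 3: elementwise exp(logit + bias - max) / denominator yields
// the final 1x1000 probabilities returned below.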
%93 = flow.dispatch.workgroups[%c1000, %c1, %c1](%90, %cst_261, %91, %92) : (tensor<1x1000xf32>, tensor<1x1000xf32>, tensor<f32>, tensor<f32>) -> tensor<1x1000xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x1000xf32>, %arg2: !flow.dispatch.tensor<readonly:1x1000xf32>, %arg3: !flow.dispatch.tensor<readonly:f32>, %arg4: !flow.dispatch.tensor<readonly:f32>, %arg5: !flow.dispatch.tensor<writeonly:1x1000xf32>) {
%c1_262 = constant 1 : index
%c1000_263 = constant 1000 : index
%94 = flow.dispatch.tensor.load %arg3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%95 = flow.dispatch.tensor.load %arg4, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%96 = linalg.init_tensor [1, 1000] : tensor<1x1000xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %97 to %c1_262 step %98 {
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%100 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %99 to %c1000_263 step %100 {
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1)>(%arg6, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1000)>(%arg7, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg6, %arg7], sizes = [%101, %102], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x1000xf32> -> tensor<?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1)>(%arg6, %workgroup_size_1)
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1000)>(%arg7, %workgroup_size_0)
%106 = flow.dispatch.tensor.load %arg2, offsets = [%arg6, %arg7], sizes = [%104, %105], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x1000xf32> -> tensor<?x?xf32>
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1)>(%arg6, %workgroup_size_1)
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1000)>(%arg7, %workgroup_size_0)
%109 = subtensor %96[%arg6, %arg7] [%107, %108] [1, 1] : tensor<1x1000xf32> to tensor<?x?xf32>
%110 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%103, %106, %94, %95 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<f32>, tensor<f32>) outs(%109 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): // no predecessors
%111 = addf %arg8, %arg9 : f32
%112 = subf %111, %arg10 : f32
%113 = math.exp %112 : f32
%114 = divf %113, %arg11 : f32
linalg.yield %114 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %110, %arg5, offsets = [%arg6, %arg7], sizes = [%107, %108], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1x1000xf32>
}
}
flow.return
}
return %93 : tensor<1x1000xf32>
}
// *** IR Dump After Canonicalizer ***
func @call(%arg0: tensor<1x224x224x3xf32> {tf._user_specified_name = "x"}) -> tensor<1x1000xf32> attributes {iree.module.export, iree.reflection = {abi = "sip", abiv = 1 : i32, f = "I17!B13!d1d224d224d3R11!B8!d1d1000", fv = "1", sip = "I8!S5!k0_0R3!_0"}, tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf.shape<1x224x224x3>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, 
#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], tf.signature.is_stateful} {
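// The opaque<"_", "0xDEADBEEF"> payloads below are not real data: the IR
// printer elides large constant attributes (here, the model weights) and
// substitutes this fixed placeholder.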
%cst = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_0 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_1 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_2 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_3 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_4 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_5 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_6 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_7 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_8 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_9 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x96xf32>
%cst_10 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_11 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_12 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_13 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_14 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_15 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_16 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_17 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_18 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_19 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_20 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x96xf32>
%cst_21 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_22 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_23 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_24 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_25 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_26 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_27 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_28 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_29 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_30 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_31 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x96xf32>
%cst_32 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_33 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_34 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_35 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_36 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_37 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_38 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x576xf32>
%cst_39 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_40 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_41 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_42 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x576x160xf32>
%cst_43 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_44 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_45 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_46 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_47 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_48 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_49 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_50 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_51 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_52 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_53 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x160xf32>
%cst_54 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_55 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_56 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_57 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_58 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_59 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_60 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_61 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_62 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_63 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_64 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x160xf32>
%cst_65 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_66 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_67 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_68 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_69 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_70 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_71 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x160x960xf32>
%cst_72 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_73 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_74 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_75 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x960x320xf32>
%cst_76 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_77 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_78 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_79 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_80 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_81 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_82 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x16x96xf32>
%cst_83 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_84 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_85 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_86 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x96x24xf32>
%cst_87 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_88 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_89 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_90 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_91 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_92 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_93 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x24x144xf32>
%cst_94 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_95 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_96 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_97 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x144x24xf32>
%cst_98 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_99 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_100 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_101 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_102 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_103 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_104 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x24x144xf32>
%cst_105 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_106 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_107 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_108 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x144x32xf32>
%cst_109 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_110 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_111 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_112 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_113 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_114 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_115 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_116 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_117 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_118 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_119 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x32xf32>
%cst_120 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_121 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_122 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_123 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_124 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_125 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_126 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_127 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_128 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_129 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_130 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x32xf32>
%cst_131 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_132 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_133 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_134 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_135 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_136 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_137 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x192xf32>
%cst_138 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_139 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_140 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_141 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x192x64xf32>
%cst_142 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_143 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_144 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_145 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_146 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_147 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_148 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_149 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_150 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_151 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_152 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_153 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_154 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_155 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_156 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_157 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_158 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_159 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_160 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_161 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_162 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_163 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_164 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_165 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_166 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_167 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_168 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_169 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_170 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x64x384xf32>
%cst_171 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_172 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_173 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_174 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x384x64xf32>
%cst_175 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_176 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_177 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_178 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x3x32xf32>
%cst_179 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_180 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_181 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_182 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x320x1280xf32>
%cst_183 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_184 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_185 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_186 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_187 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_188 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_189 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1x32x16xf32>
%cst_190 = constant opaque<"_", "0xDEADBEEF"> : tensor<1000xf32>
%cst_191 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280x1000xf32>
%cst_192 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_193 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_194 = constant opaque<"_", "0xDEADBEEF"> : tensor<16xf32>
%cst_195 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_196 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_197 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_198 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_199 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_200 = constant opaque<"_", "0xDEADBEEF"> : tensor<24xf32>
%cst_201 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_202 = constant opaque<"_", "0xDEADBEEF"> : tensor<144xf32>
%cst_203 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_204 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_205 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_206 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_207 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_208 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_209 = constant opaque<"_", "0xDEADBEEF"> : tensor<32xf32>
%cst_210 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_211 = constant opaque<"_", "0xDEADBEEF"> : tensor<192xf32>
%cst_212 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_213 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_214 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_215 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_216 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_217 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_218 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_219 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_220 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_221 = constant opaque<"_", "0xDEADBEEF"> : tensor<64xf32>
%cst_222 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_223 = constant opaque<"_", "0xDEADBEEF"> : tensor<384xf32>
%cst_224 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_225 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_226 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_227 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_228 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_229 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_230 = constant opaque<"_", "0xDEADBEEF"> : tensor<96xf32>
%cst_231 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_232 = constant opaque<"_", "0xDEADBEEF"> : tensor<576xf32>
%cst_233 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_234 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_235 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_236 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_237 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_238 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_239 = constant opaque<"_", "0xDEADBEEF"> : tensor<160xf32>
%cst_240 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_241 = constant opaque<"_", "0xDEADBEEF"> : tensor<960xf32>
%cst_242 = constant opaque<"_", "0xDEADBEEF"> : tensor<320xf32>
%cst_243 = constant opaque<"_", "0xDEADBEEF"> : tensor<1280xf32>
%cst_244 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x32xf32>
%cst_245 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x96xf32>
%cst_246 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x144xf32>
%cst_247 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x144xf32>
%cst_248 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_249 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_250 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x192xf32>
%cst_251 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_252 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_253 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_254 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x384xf32>
%cst_255 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_256 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_257 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x576xf32>
%cst_258 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_259 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_260 = constant opaque<"_", "0xDEADBEEF"> : tensor<3x3x960xf32>
%cst_261 = constant opaque<"_", "0xDEADBEEF"> : tensor<1x1000xf32>
%c112 = constant 112 : index
%c24 = constant 24 : index
%c56 = constant 56 : index
%c28 = constant 28 : index
%c64 = constant 64 : index
%c14 = constant 14 : index
%c160 = constant 160 : index
%c320 = constant 320 : index
%c7 = constant 7 : index
%c1280 = constant 1280 : index
%c1000 = constant 1000 : index
%c3 = constant 3 : index
%c225 = constant 225 : index
%c32 = constant 32 : index
%c114 = constant 114 : index
%c96 = constant 96 : index
%c113 = constant 113 : index
%c58 = constant 58 : index
%c144 = constant 144 : index
%c57 = constant 57 : index
%c30 = constant 30 : index
%c192 = constant 192 : index
%c29 = constant 29 : index
%c384 = constant 384 : index
%c16 = constant 16 : index
%c576 = constant 576 : index
%c15 = constant 15 : index
%c960 = constant 960 : index
%c9 = constant 9 : index
%c1 = constant 1 : index
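// Start of the dispatch sequence after canonicalization. The first dispatch
// zero-fills a 1x225x225x3 buffer used to pad the 224x224 input.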
%0 = flow.dispatch.workgroups[%c3, %c225, %c225]() : () -> tensor<1x225x225x3xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x225x225x3xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 225, 225, 3] : tensor<1x225x225x3xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x225x225x3xf32>, f32 -> tensor<1x225x225x3xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x225x225x3xf32> -> !flow.dispatch.tensor<writeonly:1x225x225x3xf32>
flow.return
}
%1 = flow.dispatch.workgroups[%c3, %c225, %c225](%arg0, %0) : (tensor<1x224x224x3xf32>, tensor<1x225x225x3xf32>) -> %0 =
(%arg1: !flow.dispatch.tensor<readonly:1x224x224x3xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x225x225x3xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x225x225x3xf32> -> tensor<1x225x225x3xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 224, 224, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> into tensor<1x225x225x3xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x225x225x3xf32> -> !flow.dispatch.tensor<readwrite:1x225x225x3xf32>
flow.return
}
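// Dispatch %2: the stem convolution. A 3x3, stride-2 conv (1x225x225x3 ->
// 1x112x112x32) is tiled over a 3-D workgroup grid (z = output rows,
// y = output columns, x = output channels) via the nested scf.for loops,
// then fused with a per-channel affine ((x - a) * b / c + d, i.e. folded
// batch norm) and a ReLU6 clamp. The cmpf uno / select pairs against the
// 0x7FC00000 quiet-NaN constant propagate NaNs through the min/max clamping.
// The same tiling and epilogue pattern repeats in the dispatches below.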
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%cst_175, %cst_177, %cst_192, %cst_176, %1, %cst_178) : (tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c112_265 = constant 112 : index
%c32_266 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c112_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c112_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c32_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%115 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%100, %arg8)
%116 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%117 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%101, %arg9)
%118 = flow.dispatch.tensor.load %arg5, offsets = [0, %114, %116, 0], sizes = [1, %115, %117, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [3, 3, 3, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%128 = subf %arg11, %arg12 : f32
%129 = mulf %128, %arg13 : f32
%130 = divf %129, %arg14 : f32
%131 = addf %130, %arg15 : f32
%132 = cmpf olt, %131, %cst_263 : f32
%133 = select %132, %131, %cst_263 : f32
%134 = cmpf uno, %131, %cst_263 : f32
%135 = select %134, %cst_262, %133 : f32
%136 = cmpf ogt, %135, %cst_264 : f32
%137 = select %136, %135, %cst_264 : f32
%138 = cmpf uno, %135, %cst_264 : f32
%139 = select %138, %cst_262, %137 : f32
linalg.yield %139 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
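// Dispatches %3 and %4 pad the stem output for the following stride-1 3x3
// depthwise conv: zero-fill a 1x114x114x32 buffer, then insert the
// 1x112x112x32 result at offset [0, 1, 1, 0] (one zero on every spatial edge).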
%3 = flow.dispatch.workgroups[%c32, %c114, %c114]() : () -> tensor<1x114x114x32xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x114x114x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 114, 114, 32] : tensor<1x114x114x32xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x114x114x32xf32>, f32 -> tensor<1x114x114x32xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x114x114x32xf32> -> !flow.dispatch.tensor<writeonly:1x114x114x32xf32>
flow.return
}
%4 = flow.dispatch.workgroups[%c32, %c114, %c114](%2, %3) : (tensor<1x112x112x32xf32>, tensor<1x114x114x32xf32>) -> %3 =
(%arg1: !flow.dispatch.tensor<readonly:1x112x112x32xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x114x114x32xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x114x114x32xf32> -> tensor<1x114x114x32xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 112, 112, 32] [1, 1, 1, 1] : tensor<1x112x112x32xf32> into tensor<1x114x114x32xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x114x114x32xf32> -> !flow.dispatch.tensor<readwrite:1x114x114x32xf32>
flow.return
}
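// Dispatch %5: 3x3 stride-1 depthwise convolution (filter 3x3x32) on the
// padded 114x114 input, fused with the same batch-norm affine + ReLU6
// epilogue as %2.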
%5 = flow.dispatch.workgroups[%c32, %c112, %c112](%cst_183, %cst_185, %cst_193, %cst_184, %4, %cst_244) : (tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x114x114x32xf32>, tensor<3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:1x114x114x32xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x32xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c112_265 = constant 112 : index
%c32_266 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c112_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c112_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c32_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 114)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 114)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x114x114x32xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x32xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
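// Dispatch %6: 1x1 projection conv (32 -> 16 channels) plus batch-norm
// affine, with no clamp in the epilogue -- the linear bottleneck of a
// MobileNetV2-style inverted-residual block.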
%6 = flow.dispatch.workgroups[%c16, %c112, %c112](%cst_186, %cst_188, %cst_194, %cst_187, %5, %cst_189) : (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<1x112x112x32xf32>, tensor<1x1x32x16xf32>) -> tensor<1x112x112x16xf32> =
(%arg1: !flow.dispatch.tensor<readonly:16xf32>, %arg2: !flow.dispatch.tensor<readonly:16xf32>, %arg3: !flow.dispatch.tensor<readonly:16xf32>, %arg4: !flow.dispatch.tensor<readonly:16xf32>, %arg5: !flow.dispatch.tensor<readonly:1x112x112x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x16xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x16xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c112_263 = constant 112 : index
%c16_264 = constant 16 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c112_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c112_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c16_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:16xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 16)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x?x?x32xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 16, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x16xf32> -> tensor<1x1x32x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 16, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
linalg.yield %127 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
}
}
}
flow.return
}
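// Dispatch %7: 1x1 expansion conv (16 -> 96 channels) + batch-norm affine +
// ReLU6, opening the next inverted-residual block.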
%7 = flow.dispatch.workgroups[%c96, %c112, %c112](%cst_79, %cst_81, %cst_195, %cst_80, %6, %cst_82) : (tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x112x112x16xf32>, tensor<1x1x16x96xf32>) -> tensor<1x112x112x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:1x112x112x16xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x16x96xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x112x112x96xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c112_265 = constant 112 : index
%c96_266 = constant 96 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c112_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c112_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c96_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x16xf32> -> tensor<1x?x?x16xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 16, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x16x96xf32> -> tensor<1x1x16x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x16xf32>, tensor<1x1x16x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x96xf32>
}
}
}
flow.return
}
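// Dispatches %8 and %9 pad for a stride-2 depthwise conv: zero-fill
// 1x113x113x96 and insert the 112x112 activation at [0, 0, 0, 0], i.e.
// zeros only on the bottom/right edges, as in %0/%1.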
%8 = flow.dispatch.workgroups[%c96, %c113, %c113]() : () -> tensor<1x113x113x96xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x113x113x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 113, 113, 96] : tensor<1x113x113x96xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x113x113x96xf32>, f32 -> tensor<1x113x113x96xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x113x113x96xf32> -> !flow.dispatch.tensor<writeonly:1x113x113x96xf32>
flow.return
}
%9 = flow.dispatch.workgroups[%c96, %c113, %c113](%7, %8) : (tensor<1x112x112x96xf32>, tensor<1x113x113x96xf32>) -> %8 =
(%arg1: !flow.dispatch.tensor<readonly:1x112x112x96xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x113x113x96xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x112x112x96xf32> -> tensor<1x112x112x96xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x113x113x96xf32> -> tensor<1x113x113x96xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 112, 112, 96] [1, 1, 1, 1] : tensor<1x112x112x96xf32> into tensor<1x113x113x96xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x113x113x96xf32> -> !flow.dispatch.tensor<readwrite:1x113x113x96xf32>
flow.return
}
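// Dispatch %10: 3x3 stride-2 depthwise conv (113x113 -> 56x56, 96 channels)
// + batch-norm affine + ReLU6.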
%10 = flow.dispatch.workgroups[%c96, %c56, %c56](%cst_76, %cst_78, %cst_196, %cst_77, %9, %cst_245) : (tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x113x113x96xf32>, tensor<3x3x96xf32>) -> tensor<1x56x56x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:1x113x113x96xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x96xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x96xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c96_266 = constant 96 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c56_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c56_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c96_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%115 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 113)>(%100, %arg8)
%116 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%117 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 113)>(%101, %arg9)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg5, offsets = [0, %114, %116, %arg10], sizes = [1, %115, %117, %118], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x113x113x96xf32> -> tensor<1x?x?x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x96xf32> -> tensor<3x3x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%125 = linalg.init_tensor [1, %122, %123, %124] : tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%127, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%129 = subf %arg11, %arg12 : f32
%130 = mulf %129, %arg13 : f32
%131 = divf %130, %arg14 : f32
%132 = addf %131, %arg15 : f32
%133 = cmpf olt, %132, %cst_263 : f32
%134 = select %133, %132, %cst_263 : f32
%135 = cmpf uno, %132, %cst_263 : f32
%136 = select %135, %cst_262, %134 : f32
%137 = cmpf ogt, %136, %cst_264 : f32
%138 = select %137, %136, %cst_264 : f32
%139 = cmpf uno, %136, %cst_264 : f32
%140 = select %139, %cst_262, %138 : f32
linalg.yield %140 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %128, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x96xf32>
}
}
}
flow.return
}
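// Dispatch %11: 1x1 projection conv (96 -> 24 channels) + batch-norm affine,
// linear (no activation).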
%11 = flow.dispatch.workgroups[%c24, %c56, %c56](%cst_83, %cst_85, %cst_197, %cst_84, %10, %cst_86) : (tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<1x56x56x96xf32>, tensor<1x1x96x24xf32>) -> tensor<1x56x56x24xf32> =
(%arg1: !flow.dispatch.tensor<readonly:24xf32>, %arg2: !flow.dispatch.tensor<readonly:24xf32>, %arg3: !flow.dispatch.tensor<readonly:24xf32>, %arg4: !flow.dispatch.tensor<readonly:24xf32>, %arg5: !flow.dispatch.tensor<readonly:1x56x56x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x24xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x24xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c56_263 = constant 56 : index
%c24_264 = constant 24 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c56_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c56_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c24_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x96xf32> -> tensor<1x?x?x96xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x24xf32> -> tensor<1x1x96x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
linalg.yield %127 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x24xf32>
}
}
}
flow.return
}
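// Dispatch %12: 1x1 expansion conv (24 -> 144 channels) + batch-norm affine
// + ReLU6.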
%12 = flow.dispatch.workgroups[%c144, %c56, %c56](%cst_90, %cst_92, %cst_198, %cst_91, %11, %cst_93) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x56x56x24xf32>, tensor<1x1x24x144xf32>) -> tensor<1x56x56x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x56x56x24xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x24x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c144_266 = constant 144 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c56_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c56_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c144_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 24], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x24xf32> -> tensor<1x?x?x24xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 24, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x24x144xf32> -> tensor<1x1x24x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x24xf32>, tensor<1x1x24x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x144xf32>
}
}
}
flow.return
}
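// Dispatches %13 and %14: symmetric 1-pixel zero padding (56x56 -> 58x58,
// 144 channels) for the next stride-1 depthwise conv.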
%13 = flow.dispatch.workgroups[%c144, %c58, %c58]() : () -> tensor<1x58x58x144xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x58x58x144xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 58, 58, 144] : tensor<1x58x58x144xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x58x58x144xf32>, f32 -> tensor<1x58x58x144xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x58x58x144xf32> -> !flow.dispatch.tensor<writeonly:1x58x58x144xf32>
flow.return
}
%14 = flow.dispatch.workgroups[%c144, %c58, %c58](%12, %13) : (tensor<1x56x56x144xf32>, tensor<1x58x58x144xf32>) -> %13 =
(%arg1: !flow.dispatch.tensor<readonly:1x56x56x144xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x58x58x144xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x56x56x144xf32> -> tensor<1x56x56x144xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x58x58x144xf32> -> tensor<1x58x58x144xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 56, 56, 144] [1, 1, 1, 1] : tensor<1x56x56x144xf32> into tensor<1x58x58x144xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x58x58x144xf32> -> !flow.dispatch.tensor<readwrite:1x58x58x144xf32>
flow.return
}
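// Dispatch %15: 3x3 stride-1 depthwise conv on the padded 58x58 input
// (-> 1x56x56x144) + batch-norm affine + ReLU6.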
%15 = flow.dispatch.workgroups[%c144, %c56, %c56](%cst_87, %cst_89, %cst_199, %cst_88, %14, %cst_246) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x58x58x144xf32>, tensor<3x3x144xf32>) -> tensor<1x56x56x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x58x58x144xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c144_266 = constant 144 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c56_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c56_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c144_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 58)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 58)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x58x58x144xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x144xf32>
}
}
}
flow.return
}
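// Dispatch %16: 1x1 projection conv (144 -> 24 channels) + batch-norm
// affine, then an addf with %103 (a tile loaded from %11) -- the residual
// skip connection of the inverted-residual block, fused into this dispatch;
// note the extra readonly 1x56x56x24 operand compared to %11.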
%16 = flow.dispatch.workgroups[%c24, %c56, %c56](%11, %cst_94, %cst_96, %cst_200, %cst_95, %15, %cst_97) : (tensor<1x56x56x24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<1x56x56x144xf32>, tensor<1x1x144x24xf32>) -> tensor<1x56x56x24xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x56x56x24xf32>, %arg2: !flow.dispatch.tensor<readonly:24xf32>, %arg3: !flow.dispatch.tensor<readonly:24xf32>, %arg4: !flow.dispatch.tensor<readonly:24xf32>, %arg5: !flow.dispatch.tensor<readonly:24xf32>, %arg6: !flow.dispatch.tensor<readonly:1x56x56x144xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x144x24xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x56x56x24xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c56_263 = constant 56 : index
%c24_264 = constant 24 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c56_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c56_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c24_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x24xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:24xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 24)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x144xf32> -> tensor<1x?x?x144xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 144, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x144x24xf32> -> tensor<1x1x144x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 24, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x144xf32>, tensor<1x1x144x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
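// The arithmetic below appears to be a folded batch-norm plus residual add:
// ((conv - mean) * scale) / denom + offset, then + %arg12 (the skip input);
// the per-channel denominator is presumably a precomputed sqrt(variance + eps).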
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x24xf32>
}
}
}
flow.return
}
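// %17: likely the 1x1 expansion convolution (24 -> 144 channels) of a
// MobileNetV2-style inverted-residual block, with folded batch-norm and a
// ReLU6 clamp fused into the trailing linalg.generic.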
%17 = flow.dispatch.workgroups[%c144, %c56, %c56](%cst_101, %cst_103, %cst_201, %cst_102, %16, %cst_104) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x56x56x24xf32>, tensor<1x1x24x144xf32>) -> tensor<1x56x56x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x56x56x24xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x24x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x56x56x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c56_265 = constant 56 : index
%c144_266 = constant 144 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c56_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c56_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c144_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 56)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 24], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x56x56x24xf32> -> tensor<1x?x?x24xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 24, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x24x144xf32> -> tensor<1x1x24x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 56, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x24xf32>, tensor<1x1x24x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
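// NaN-propagating ReLU6 clamp: cmpf olt/select gives min(x, 6.0), cmpf ogt/select
// gives max(., 0.0), and the cmpf uno (unordered) selects route NaN inputs to the
// quiet-NaN constant 0x7FC00000 instead of silently clamping them.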
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x56x56x144xf32>
}
}
}
flow.return
}
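// %18: zero-fills a 1x57x57x144 buffer, presumably the padded input for the
// stride-2 3x3 depthwise convolution in %20.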
%18 = flow.dispatch.workgroups[%c144, %c57, %c57]() : () -> tensor<1x57x57x144xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x57x57x144xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 57, 57, 144] : tensor<1x57x57x144xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x57x57x144xf32>, f32 -> tensor<1x57x57x144xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x57x57x144xf32> -> !flow.dispatch.tensor<writeonly:1x57x57x144xf32>
flow.return
}
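// %19: inserts the 56x56 activation at offset [0, 0, 0, 0] of the 57x57 zero
// buffer, i.e. one padding row/column on the bottom/right only, consistent
// with SAME padding for a stride-2 3x3 filter on an even-sized input.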
%19 = flow.dispatch.workgroups[%c144, %c57, %c57](%17, %18) : (tensor<1x56x56x144xf32>, tensor<1x57x57x144xf32>) -> %18 =
(%arg1: !flow.dispatch.tensor<readonly:1x56x56x144xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x57x57x144xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x56x56x144xf32> -> tensor<1x56x56x144xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x57x57x144xf32> -> tensor<1x57x57x144xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 56, 56, 144] [1, 1, 1, 1] : tensor<1x56x56x144xf32> into tensor<1x57x57x144xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x57x57x144xf32> -> !flow.dispatch.tensor<readwrite:1x57x57x144xf32>
flow.return
}
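// %20: 3x3 depthwise convolution with stride 2 (57x57 padded -> 28x28), again
// fused with folded batch-norm and the NaN-safe ReLU6 clamp.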
%20 = flow.dispatch.workgroups[%c144, %c28, %c28](%cst_98, %cst_100, %cst_202, %cst_99, %19, %cst_247) : (tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) -> tensor<1x28x28x144xf32> =
(%arg1: !flow.dispatch.tensor<readonly:144xf32>, %arg2: !flow.dispatch.tensor<readonly:144xf32>, %arg3: !flow.dispatch.tensor<readonly:144xf32>, %arg4: !flow.dispatch.tensor<readonly:144xf32>, %arg5: !flow.dispatch.tensor<readonly:1x57x57x144xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x144xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x144xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c144_266 = constant 144 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c28_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c28_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c144_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:144xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 144)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%115 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 57)>(%100, %arg8)
%116 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%117 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 57)>(%101, %arg9)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg5, offsets = [0, %114, %116, %arg10], sizes = [1, %115, %117, %118], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x57x57x144xf32> -> tensor<1x?x?x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 144, d1)>(%arg10, %workgroup_size_0)
%125 = linalg.init_tensor [1, %122, %123, %124] : tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%127, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%129 = subf %arg11, %arg12 : f32
%130 = mulf %129, %arg13 : f32
%131 = divf %130, %arg14 : f32
%132 = addf %131, %arg15 : f32
%133 = cmpf olt, %132, %cst_263 : f32
%134 = select %133, %132, %cst_263 : f32
%135 = cmpf uno, %132, %cst_263 : f32
%136 = select %135, %cst_262, %134 : f32
%137 = cmpf ogt, %136, %cst_264 : f32
%138 = select %137, %136, %cst_264 : f32
%139 = cmpf uno, %136, %cst_264 : f32
%140 = select %139, %cst_262, %138 : f32
linalg.yield %140 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %128, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x144xf32>
}
}
}
flow.return
}
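// %21: 1x1 projection convolution (144 -> 32 channels) plus folded batch-norm.
// Note the generic body has no clamp: the projection in an inverted-residual
// block is a linear bottleneck.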
%21 = flow.dispatch.workgroups[%c32, %c28, %c28](%cst_105, %cst_107, %cst_203, %cst_106, %20, %cst_108) : (tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x28x28x144xf32>, tensor<1x1x144x32xf32>) -> tensor<1x28x28x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x144xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x144x32xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c28_263 = constant 28 : index
%c32_264 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c28_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c28_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c32_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x144xf32> -> tensor<1x?x?x144xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 144, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x144x32xf32> -> tensor<1x1x144x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x144xf32>, tensor<1x1x144x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
linalg.yield %127 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x32xf32>
}
}
}
flow.return
}
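// %22: 1x1 expansion convolution (32 -> 192 channels) + batch-norm + ReLU6,
// likely opening the next inverted-residual block.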
%22 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_112, %cst_114, %cst_204, %cst_113, %21, %cst_115) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c28_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c28_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c192_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x32xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x192xf32> -> tensor<1x1x32x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
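// %23: zero-fills a 1x30x30x192 padding buffer for the stride-1 3x3 depthwise
// convolution below.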
%23 = flow.dispatch.workgroups[%c192, %c30, %c30]() : () -> tensor<1x30x30x192xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x30x30x192xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 30, 30, 192] : tensor<1x30x30x192xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x30x30x192xf32>, f32 -> tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<writeonly:1x30x30x192xf32>
flow.return
}
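// %24: pads the 28x28 activation with a 1-pixel border on each side by
// inserting it at offset [0, 1, 1, 0] of the 30x30 zero buffer.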
%24 = flow.dispatch.workgroups[%c192, %c30, %c30](%22, %23) : (tensor<1x28x28x192xf32>, tensor<1x30x30x192xf32>) -> %23 =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x30x30x192xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x28x28x192xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x30x30x192xf32> -> tensor<1x30x30x192xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<readwrite:1x30x30x192xf32>
flow.return
}
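// %25: 3x3 depthwise convolution with stride 1 (30x30 padded -> 28x28) +
// batch-norm + ReLU6. Each tile reads an input window of tile_size + 2 rows and
// columns (the affine.min with "d1 + 2"), as expected for a 3x3 stride-1 filter.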
%25 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_109, %cst_111, %cst_205, %cst_110, %24, %cst_248) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x30x30x192xf32>, tensor<3x3x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x30x30x192xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c28_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c28_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c192_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x30x30x192xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x192xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
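// %26: 1x1 projection convolution (192 -> 32) + batch-norm, fused with a
// residual add of %21 (the extra addf of %arg12 in the generic body), which
// completes this inverted-residual block.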
%26 = flow.dispatch.workgroups[%c32, %c28, %c28](%21, %cst_116, %cst_118, %cst_206, %cst_117, %25, %cst_119) : (tensor<1x28x28x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x28x28x192xf32>, tensor<1x1x192x32xf32>) -> tensor<1x28x28x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x192x32xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x28x28x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c28_263 = constant 28 : index
%c32_264 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c28_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c28_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c32_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 192], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x?x?x192xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 192, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x192x32xf32> -> tensor<1x1x192x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x192xf32>, tensor<1x1x192x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x32xf32>
}
}
}
flow.return
}
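// %27: 1x1 expansion convolution (32 -> 192) + batch-norm + ReLU6 for the
// following block.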
%27 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_123, %cst_125, %cst_207, %cst_124, %26, %cst_126) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c28_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c28_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c192_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x32xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x192xf32> -> tensor<1x1x32x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
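// %28: zero-fills another 1x30x30x192 padding buffer, as in %23.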
%28 = flow.dispatch.workgroups[%c192, %c30, %c30]() : () -> tensor<1x30x30x192xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x30x30x192xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 30, 30, 192] : tensor<1x30x30x192xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x30x30x192xf32>, f32 -> tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<writeonly:1x30x30x192xf32>
flow.return
}
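// %29: 1-pixel border pad via subtensor_insert at [0, 1, 1, 0], as in %24.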
%29 = flow.dispatch.workgroups[%c192, %c30, %c30](%27, %28) : (tensor<1x28x28x192xf32>, tensor<1x30x30x192xf32>) -> %28 =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x30x30x192xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x28x28x192xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x30x30x192xf32> -> tensor<1x30x30x192xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x30x30x192xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x30x30x192xf32> -> !flow.dispatch.tensor<readwrite:1x30x30x192xf32>
flow.return
}
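// %30: 3x3 stride-1 depthwise convolution + batch-norm + ReLU6, structurally
// identical to %25.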
%30 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_120, %cst_122, %cst_208, %cst_121, %29, %cst_249) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x30x30x192xf32>, tensor<3x3x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x30x30x192xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c28_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c28_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c192_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 30)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x30x30x192xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x192xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
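// %31: 1x1 projection convolution (192 -> 32) + batch-norm + residual add with
// %26, mirroring %26's structure.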
%31 = flow.dispatch.workgroups[%c32, %c28, %c28](%26, %cst_127, %cst_129, %cst_209, %cst_128, %30, %cst_130) : (tensor<1x28x28x32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<1x28x28x192xf32>, tensor<1x1x192x32xf32>) -> tensor<1x28x28x32xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg2: !flow.dispatch.tensor<readonly:32xf32>, %arg3: !flow.dispatch.tensor<readonly:32xf32>, %arg4: !flow.dispatch.tensor<readonly:32xf32>, %arg5: !flow.dispatch.tensor<readonly:32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x192x32xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x28x28x32xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c28_263 = constant 28 : index
%c32_264 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c28_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c28_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c32_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 192], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x?x?x192xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 192, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x192x32xf32> -> tensor<1x1x192x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x192xf32>, tensor<1x1x192x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x32xf32>
}
}
}
flow.return
}
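// %32: tiled 1x1 convolution expanding 1x28x28x32 -> 1x28x28x192, followed by the channel-wise rescale and a ReLU6 clamp.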
%32 = flow.dispatch.workgroups[%c192, %c28, %c28](%cst_134, %cst_136, %cst_210, %cst_135, %31, %cst_137) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x28x28x32xf32>, tensor<1x1x32x192xf32>) -> tensor<1x28x28x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x28x28x32xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x32x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x28x28x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c28_265 = constant 28 : index
%c192_266 = constant 192 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c28_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c28_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c192_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 28)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x28x28x32xf32> -> tensor<1x?x?x32xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 32, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x32x192xf32> -> tensor<1x1x32x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 28, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x32xf32>, tensor<1x1x32x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x28x28x192xf32>
}
}
}
flow.return
}
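// %33: materialize a zero-filled 1x29x29x192 tensor, used as the padded input for the stride-2 depthwise convolution below.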
%33 = flow.dispatch.workgroups[%c192, %c29, %c29]() : () -> tensor<1x29x29x192xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x29x29x192xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 29, 29, 192] : tensor<1x29x29x192xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x29x29x192xf32>, f32 -> tensor<1x29x29x192xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x29x29x192xf32> -> !flow.dispatch.tensor<writeonly:1x29x29x192xf32>
flow.return
}
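// %34: insert the 1x28x28x192 result of %32 into the zero buffer at offset [0, 0, 0, 0], padding one row/column at the bottom/right.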
%34 = flow.dispatch.workgroups[%c192, %c29, %c29](%32, %33) : (tensor<1x28x28x192xf32>, tensor<1x29x29x192xf32>) -> %33 =
(%arg1: !flow.dispatch.tensor<readonly:1x28x28x192xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x29x29x192xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x28x28x192xf32> -> tensor<1x28x28x192xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x29x29x192xf32> -> tensor<1x29x29x192xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 28, 28, 192] [1, 1, 1, 1] : tensor<1x28x28x192xf32> into tensor<1x29x29x192xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x29x29x192xf32> -> !flow.dispatch.tensor<readwrite:1x29x29x192xf32>
flow.return
}
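// %35: tiled 3x3 depthwise convolution with stride 2 (1x29x29x192 -> 1x14x14x192), plus rescale and ReLU6.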
%35 = flow.dispatch.workgroups[%c192, %c14, %c14](%cst_131, %cst_133, %cst_211, %cst_132, %34, %cst_250) : (tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<192xf32>, tensor<1x29x29x192xf32>, tensor<3x3x192xf32>) -> tensor<1x14x14x192xf32> =
(%arg1: !flow.dispatch.tensor<readonly:192xf32>, %arg2: !flow.dispatch.tensor<readonly:192xf32>, %arg3: !flow.dispatch.tensor<readonly:192xf32>, %arg4: !flow.dispatch.tensor<readonly:192xf32>, %arg5: !flow.dispatch.tensor<readonly:1x29x29x192xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x192xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x192xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c192_266 = constant 192 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c192_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:192xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 192)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%115 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 29)>(%100, %arg8)
%116 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%117 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 29)>(%101, %arg9)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg5, offsets = [0, %114, %116, %arg10], sizes = [1, %115, %117, %118], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x29x29x192xf32> -> tensor<1x?x?x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x192xf32> -> tensor<3x3x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 192, d1)>(%arg10, %workgroup_size_0)
%125 = linalg.init_tensor [1, %122, %123, %124] : tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%127, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%129 = subf %arg11, %arg12 : f32
%130 = mulf %129, %arg13 : f32
%131 = divf %130, %arg14 : f32
%132 = addf %131, %arg15 : f32
%133 = cmpf olt, %132, %cst_263 : f32
%134 = select %133, %132, %cst_263 : f32
%135 = cmpf uno, %132, %cst_263 : f32
%136 = select %135, %cst_262, %134 : f32
%137 = cmpf ogt, %136, %cst_264 : f32
%138 = select %137, %136, %cst_264 : f32
%139 = cmpf uno, %136, %cst_264 : f32
%140 = select %139, %cst_262, %138 : f32
linalg.yield %140 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %128, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x192xf32>
}
}
}
flow.return
}
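// %36: tiled 1x1 convolution projecting 1x14x14x192 -> 1x14x14x64 with the channel-wise rescale; note there is no activation clamp on this projection.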
%36 = flow.dispatch.workgroups[%c64, %c14, %c14](%cst_138, %cst_140, %cst_212, %cst_139, %35, %cst_141) : (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x192xf32>, tensor<1x1x192x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x192xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x192x64xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c64_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 192], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x192xf32> -> tensor<1x?x?x192xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 192, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x192x64xf32> -> tensor<1x1x192x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x192xf32>, tensor<1x1x192x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
linalg.yield %127 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
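// %37: tiled 1x1 convolution expanding 1x14x14x64 -> 1x14x14x384, plus rescale and ReLU6.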
%37 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_145, %cst_147, %cst_213, %cst_146, %36, %cst_148) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
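// %38: zero-fill a 1x16x16x384 tensor for the next depthwise convolution's padded input.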
%38 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
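// %39: insert the 1x14x14x384 result of %37 at offset [0, 1, 1, 0], padding one pixel on each spatial border.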
%39 = flow.dispatch.workgroups[%c384, %c16, %c16](%37, %38) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %38 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
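// %40: tiled 3x3 depthwise convolution with stride 1 (1x16x16x384 -> 1x14x14x384), plus rescale and ReLU6.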
%40 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_142, %cst_144, %cst_214, %cst_143, %39, %cst_251) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
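// %41: tiled 1x1 convolution projecting 1x14x14x384 -> 1x14x14x64, with rescale and a residual add with %36.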
%41 = flow.dispatch.workgroups[%c64, %c14, %c14](%36, %cst_149, %cst_151, %cst_215, %cst_150, %40, %cst_152) : (tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x384x64xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c14_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c14_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c64_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 384, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x64xf32> -> tensor<1x1x384x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
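// %42: 1x1 convolution expanding 64 -> 384 channels with rescale and ReLU6, same structure as %37.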
%42 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_156, %cst_158, %cst_216, %cst_157, %41, %cst_159) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
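// %43: zero-fill another 1x16x16x384 padding buffer.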
%43 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
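// %44: pad %42 into the 16x16 buffer at offset [0, 1, 1, 0], as in %39.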
%44 = flow.dispatch.workgroups[%c384, %c16, %c16](%42, %43) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %43 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
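// %45: 3x3 depthwise convolution with stride 1 over the padded input, plus rescale and ReLU6, mirroring %40.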
%45 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_153, %cst_155, %cst_217, %cst_154, %44, %cst_252) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
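// %46: 1x1 convolution projecting 1x14x14x384 -> 1x14x14x64, with rescale and a residual add with %41 (same structure as %41's dispatch).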
%46 = flow.dispatch.workgroups[%c64, %c14, %c14](%41, %cst_160, %cst_162, %cst_218, %cst_161, %45, %cst_163) : (tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x384x64xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c14_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c14_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c64_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 384, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x64xf32> -> tensor<1x1x384x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
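// %47: 1x1 convolution expanding 64 -> 384 channels, with the same fused
// batch-norm transform plus a clamp to [0, 6] (ReLU6). The cmpf uno / select
// pairs explicitly propagate NaN (0x7FC00000) through both ends of the clamp.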
%47 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_167, %cst_169, %cst_219, %cst_168, %46, %cst_170) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
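// %48/%49: materialize a zero-filled 1x16x16x384 tensor, then subtensor_insert
// the 14x14 feature map at spatial offset [1, 1] -- a one-pixel zero pad on
// each edge, sized for the 3x3 depthwise convolution that follows.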
%48 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
%49 = flow.dispatch.workgroups[%c384, %c16, %c16](%47, %48) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %48 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
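// %50: 3x3 depthwise convolution (stride 1) over the padded 16x16 input, with
// batch norm + ReLU6 fused into the trailing generic. Note the input tile is
// the output tile plus a 2-row/2-column halo for the 3x3 window:
// affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>.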
%50 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_164, %cst_166, %cst_220, %cst_165, %49, %cst_253) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
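// %51: projection 1x1 conv (384 -> 64) + batch norm, with the skip connection
// added from the %46 output; same structure as the %46 dispatch.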
%51 = flow.dispatch.workgroups[%c64, %c14, %c14](%46, %cst_171, %cst_173, %cst_221, %cst_172, %50, %cst_174) : (tensor<1x14x14x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x64xf32>) -> tensor<1x14x14x64xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg2: !flow.dispatch.tensor<readonly:64xf32>, %arg3: !flow.dispatch.tensor<readonly:64xf32>, %arg4: !flow.dispatch.tensor<readonly:64xf32>, %arg5: !flow.dispatch.tensor<readonly:64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x384x64xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x64xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c64_264 = constant 64 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c14_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c14_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c64_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:64xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 64)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 384, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x64xf32> -> tensor<1x1x384x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 64, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x64xf32>
}
}
}
flow.return
}
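// %52: expansion 1x1 conv (64 -> 384) + batch norm + ReLU6; same pattern as %47.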
%52 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst_2, %cst_4, %cst_222, %cst_3, %51, %cst_5) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x14x14x64xf32>, tensor<1x1x64x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x64xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x64x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x64xf32> -> tensor<1x?x?x64xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 64, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x64x384xf32> -> tensor<1x1x64x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x64xf32>, tensor<1x1x64x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
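// %53/%54: zero-fill a 1x16x16x384 buffer and pad the 14x14 map into it,
// as in %48/%49.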
%53 = flow.dispatch.workgroups[%c384, %c16, %c16]() : () -> tensor<1x16x16x384xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x384xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 384] : tensor<1x16x16x384xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x384xf32>, f32 -> tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x384xf32>
flow.return
}
%54 = flow.dispatch.workgroups[%c384, %c16, %c16](%52, %53) : (tensor<1x14x14x384xf32>, tensor<1x16x16x384xf32>) -> %53 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x384xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x14x14x384xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x384xf32> -> tensor<1x16x16x384xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 384] [1, 1, 1, 1] : tensor<1x14x14x384xf32> into tensor<1x16x16x384xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x384xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x384xf32>
flow.return
}
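// %55: 3x3 depthwise conv + batch norm + ReLU6 over the padded input, as in %50.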
%55 = flow.dispatch.workgroups[%c384, %c14, %c14](%cst, %cst_1, %cst_223, %cst_0, %54, %cst_254) : (tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<384xf32>, tensor<1x16x16x384xf32>, tensor<3x3x384xf32>) -> tensor<1x14x14x384xf32> =
(%arg1: !flow.dispatch.tensor<readonly:384xf32>, %arg2: !flow.dispatch.tensor<readonly:384xf32>, %arg3: !flow.dispatch.tensor<readonly:384xf32>, %arg4: !flow.dispatch.tensor<readonly:384xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x384xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x384xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x384xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c384_266 = constant 384 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c384_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:384xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 384)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x384xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x384xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 384, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x384xf32>
}
}
}
flow.return
}
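// %56: 1x1 conv projecting 384 -> 96 channels + batch norm only -- no ReLU6
// clamp and no residual add here, which fits the point where the channel
// count changes (64 -> 96) and no skip connection is possible.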
%56 = flow.dispatch.workgroups[%c96, %c14, %c14](%cst_6, %cst_8, %cst_224, %cst_7, %55, %cst_9) : (tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x14x14x384xf32>, tensor<1x1x384x96xf32>) -> tensor<1x14x14x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x384xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x384x96xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c96_264 = constant 96 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c96_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x384xf32> -> tensor<1x?x?x384xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 384, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x384x96xf32> -> tensor<1x1x384x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x384xf32>, tensor<1x1x384x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
linalg.yield %127 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x96xf32>
}
}
}
flow.return
}
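// %57: expansion 1x1 conv (96 -> 576) + batch norm + ReLU6.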
%57 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_13, %cst_15, %cst_225, %cst_14, %56, %cst_16) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c576_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x96xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x576xf32> -> tensor<1x1x96x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
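// %58/%59: zero-pad to 1x16x16x576 via fill + subtensor_insert, as before.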
%58 = flow.dispatch.workgroups[%c576, %c16, %c16]() : () -> tensor<1x16x16x576xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x576xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 576] : tensor<1x16x16x576xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x576xf32>, f32 -> tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x576xf32>
flow.return
}
%59 = flow.dispatch.workgroups[%c576, %c16, %c16](%57, %58) : (tensor<1x14x14x576xf32>, tensor<1x16x16x576xf32>) -> %58 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x576xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x14x14x576xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x576xf32> -> tensor<1x16x16x576xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x576xf32>
flow.return
}
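// %60: 3x3 depthwise conv over the padded 576-channel input + batch norm + ReLU6.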
%60 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_10, %cst_12, %cst_226, %cst_11, %59, %cst_255) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x16x16x576xf32>, tensor<3x3x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x576xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c576_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x576xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x576xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
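// %61: projection 1x1 conv (576 -> 96) + batch norm, adding the %56 output as
// the skip connection; same fused pattern as %46 and %51.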
%61 = flow.dispatch.workgroups[%c96, %c14, %c14](%56, %cst_17, %cst_19, %cst_227, %cst_18, %60, %cst_20) : (tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x14x14x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x14x14x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x576x96xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c96_264 = constant 96 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c14_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c14_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c96_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x?x?x576xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 576, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x576x96xf32> -> tensor<1x1x576x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x576xf32>, tensor<1x1x576x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x96xf32>
}
}
}
flow.return
}
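// %62: 1x1 "expansion" convolution (96 -> 576 channels) followed by the same
// per-channel affine transform and a ReLU6: the cmpf/select ladder clamps the
// result to [0, 6], with the `uno` comparisons routing NaN inputs to the
// 0x7FC00000 (NaN) constant.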
%62 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_24, %cst_26, %cst_228, %cst_25, %61, %cst_27) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c576_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x96xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x576xf32> -> tensor<1x1x96x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
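// %63: materializes a zero-filled 1x16x16x576 tensor to act as the padded
// destination for the 3x3 depthwise convolution in %65.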
%63 = flow.dispatch.workgroups[%c576, %c16, %c16]() : () -> tensor<1x16x16x576xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x16x16x576xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 16, 16, 576] : tensor<1x16x16x576xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x16x16x576xf32>, f32 -> tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<writeonly:1x16x16x576xf32>
flow.return
}
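// %64: pads %62 from 14x14 to 16x16 by inserting it at spatial offset [1, 1]
// into the zero buffer %63, i.e. one row/column of zeros on each side.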
%64 = flow.dispatch.workgroups[%c576, %c16, %c16](%62, %63) : (tensor<1x14x14x576xf32>, tensor<1x16x16x576xf32>) -> %63 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x16x16x576xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x14x14x576xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x16x16x576xf32> -> tensor<1x16x16x576xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x16x16x576xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x16x16x576xf32> -> !flow.dispatch.tensor<readwrite:1x16x16x576xf32>
flow.return
}
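// %65: 3x3 depthwise convolution (stride 1) over the padded 16x16 input,
// fused with the per-channel affine transform and the NaN-aware ReLU6 clamp;
// yields 1x14x14x576.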
%65 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_21, %cst_23, %cst_229, %cst_22, %64, %cst_256) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x16x16x576xf32>, tensor<3x3x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x16x16x576xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c576_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 16)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x16x16x576xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x576xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
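// %66: 1x1 projection convolution (576 -> 96) plus per-channel affine
// transform and a residual add against %61 -- the same linear-bottleneck +
// skip pattern as %61.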
%66 = flow.dispatch.workgroups[%c96, %c14, %c14](%61, %cst_28, %cst_30, %cst_230, %cst_29, %65, %cst_31) : (tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<1x14x14x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x14x14x96xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg2: !flow.dispatch.tensor<readonly:96xf32>, %arg3: !flow.dispatch.tensor<readonly:96xf32>, %arg4: !flow.dispatch.tensor<readonly:96xf32>, %arg5: !flow.dispatch.tensor<readonly:96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x576x96xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x14x14x96xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c14_263 = constant 14 : index
%c96_264 = constant 96 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c14_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c14_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c96_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:96xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 96)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x?x?x576xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 576, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x576x96xf32> -> tensor<1x1x576x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 96, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x576xf32>, tensor<1x1x576x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x96xf32>
}
}
}
flow.return
}
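// %67: 1x1 expansion convolution (96 -> 576) with affine transform and ReLU6,
// opening the next (stride-2) block.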
%67 = flow.dispatch.workgroups[%c576, %c14, %c14](%cst_35, %cst_37, %cst_231, %cst_36, %66, %cst_38) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x14x14x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x14x14x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x14x14x96xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x96x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x14x14x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c14_265 = constant 14 : index
%c576_266 = constant 576 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c14_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c14_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c576_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 14)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x14x14x96xf32> -> tensor<1x?x?x96xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 96, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x96x576xf32> -> tensor<1x1x96x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 14, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x96xf32>, tensor<1x1x96x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x14x14x576xf32>
}
}
}
flow.return
}
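// %68: zero-filled 1x15x15x576 tensor for asymmetric padding.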
%68 = flow.dispatch.workgroups[%c576, %c15, %c15]() : () -> tensor<1x15x15x576xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x15x15x576xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 15, 15, 576] : tensor<1x15x15x576xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x15x15x576xf32>, f32 -> tensor<1x15x15x576xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x15x15x576xf32> -> !flow.dispatch.tensor<writeonly:1x15x15x576xf32>
flow.return
}
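// %69: inserts the 14x14 map at offset [0, 0], so the single row/column of
// zeros lands only on the bottom/right edges -- the asymmetric padding a
// stride-2 3x3 convolution with "SAME" semantics needs (an interpretation;
// the IR just records the subtensor_insert).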
%69 = flow.dispatch.workgroups[%c576, %c15, %c15](%67, %68) : (tensor<1x14x14x576xf32>, tensor<1x15x15x576xf32>) -> %68 =
(%arg1: !flow.dispatch.tensor<readonly:1x14x14x576xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x15x15x576xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x14x14x576xf32> -> tensor<1x14x14x576xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x15x15x576xf32> -> tensor<1x15x15x576xf32>
%96 = subtensor_insert %94 into %95[0, 0, 0, 0] [1, 14, 14, 576] [1, 1, 1, 1] : tensor<1x14x14x576xf32> into tensor<1x15x15x576xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x15x15x576xf32> -> !flow.dispatch.tensor<readwrite:1x15x15x576xf32>
flow.return
}
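// %70: 3x3 depthwise convolution with stride 2 (note the input offsets
// doubled via affine_map<(d0) -> (d0 * 2)>), downsampling 14x14 -> 7x7, fused
// with the affine transform and ReLU6.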
%70 = flow.dispatch.workgroups[%c576, %c7, %c7](%cst_32, %cst_34, %cst_232, %cst_33, %69, %cst_257) : (tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<1x15x15x576xf32>, tensor<3x3x576xf32>) -> tensor<1x7x7x576xf32> =
(%arg1: !flow.dispatch.tensor<readonly:576xf32>, %arg2: !flow.dispatch.tensor<readonly:576xf32>, %arg3: !flow.dispatch.tensor<readonly:576xf32>, %arg4: !flow.dispatch.tensor<readonly:576xf32>, %arg5: !flow.dispatch.tensor<readonly:1x15x15x576xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x576xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x576xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c576_266 = constant 576 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c576_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:576xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 576)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%115 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 15)>(%100, %arg8)
%116 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg9)
%117 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 15)>(%101, %arg9)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg5, offsets = [0, %114, %116, %arg10], sizes = [1, %115, %117, %118], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x15x15x576xf32> -> tensor<1x?x?x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%121 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x576xf32> -> tensor<3x3x?xf32>
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%124 = affine.min affine_map<(d0, d1) -> (-d0 + 576, d1)>(%arg10, %workgroup_size_0)
%125 = linalg.init_tensor [1, %122, %123, %124] : tensor<1x?x?x?xf32>
%126 = linalg.fill(%125, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%127 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<2> : tensor<2xi64>} ins(%119, %121 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%126 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%128 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%127, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%129 = subf %arg11, %arg12 : f32
%130 = mulf %129, %arg13 : f32
%131 = divf %130, %arg14 : f32
%132 = addf %131, %arg15 : f32
%133 = cmpf olt, %132, %cst_263 : f32
%134 = select %133, %132, %cst_263 : f32
%135 = cmpf uno, %132, %cst_263 : f32
%136 = select %135, %cst_262, %134 : f32
%137 = cmpf ogt, %136, %cst_264 : f32
%138 = select %137, %136, %cst_264 : f32
%139 = cmpf uno, %136, %cst_264 : f32
%140 = select %139, %cst_262, %138 : f32
linalg.yield %140 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %128, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x576xf32>
}
}
}
flow.return
}
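// %71: 1x1 projection convolution (576 -> 160) followed only by the
// per-channel affine transform -- no clamp, as expected of a linear
// bottleneck output.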
%71 = flow.dispatch.workgroups[%c160, %c7, %c7](%cst_39, %cst_41, %cst_233, %cst_40, %70, %cst_42) : (tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<1x7x7x576xf32>, tensor<1x1x576x160xf32>) -> tensor<1x7x7x160xf32> =
(%arg1: !flow.dispatch.tensor<readonly:160xf32>, %arg2: !flow.dispatch.tensor<readonly:160xf32>, %arg3: !flow.dispatch.tensor<readonly:160xf32>, %arg4: !flow.dispatch.tensor<readonly:160xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x576xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x576x160xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x160xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c160_264 = constant 160 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c160_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x576xf32> -> tensor<1x?x?x576xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 576, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x576x160xf32> -> tensor<1x1x576x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x576xf32>, tensor<1x1x576x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
linalg.yield %127 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x160xf32>
}
}
}
flow.return
}
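// %72: 1x1 expansion convolution (160 -> 960) with affine transform and
// ReLU6.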
%72 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_46, %cst_48, %cst_234, %cst_47, %71, %cst_49) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x160x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c960_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 160], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x160xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 160, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x160x960xf32> -> tensor<1x1x160x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x160xf32>, tensor<1x1x160x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
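// %73: zero-filled 1x9x9x960 padding buffer.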
%73 = flow.dispatch.workgroups[%c960, %c9, %c9]() : () -> tensor<1x9x9x960xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x9x9x960xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 9, 9, 960] : tensor<1x9x9x960xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<writeonly:1x9x9x960xf32>
flow.return
}
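// %74: pads the 7x7 map to 9x9 at spatial offset [1, 1] (one zero on every
// side) ahead of the stride-1 depthwise convolution in %75.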
%74 = flow.dispatch.workgroups[%c960, %c9, %c9](%72, %73) : (tensor<1x7x7x960xf32>, tensor<1x9x9x960xf32>) -> %73 =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x9x9x960xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x7x7x960xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x9x9x960xf32> -> tensor<1x9x9x960xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<readwrite:1x9x9x960xf32>
flow.return
}
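// %75: 3x3 depthwise convolution (stride 1) on the padded 9x9 input, fused
// with the affine transform and NaN-aware ReLU6; yields 1x7x7x960.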
%75 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_43, %cst_45, %cst_235, %cst_44, %74, %cst_258) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x9x9x960xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c960_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x9x9x960xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x960xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
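// The dispatch region closing above writes the 1x7x7x960 result (evidently
// %75, consumed just below). %76 next: a tiled 1x1 projection convolution,
// 960 -> 160 channels. Its trailing linalg.generic fuses the batch-norm
// style scale/shift arithmetic (subf/mulf/divf/addf) with a residual add
// against the dispatch's 1x7x7x160 input -- likely the projection stage of
// an inverted-residual block.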
%76 = flow.dispatch.workgroups[%c160, %c7, %c7](%71, %cst_50, %cst_52, %cst_236, %cst_51, %75, %cst_53) : (tensor<1x7x7x160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<1x7x7x960xf32>, tensor<1x1x960x160xf32>) -> tensor<1x7x7x160xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg2: !flow.dispatch.tensor<readonly:160xf32>, %arg3: !flow.dispatch.tensor<readonly:160xf32>, %arg4: !flow.dispatch.tensor<readonly:160xf32>, %arg5: !flow.dispatch.tensor<readonly:160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x960x160xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x7x7x160xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c160_264 = constant 160 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c7_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c7_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c160_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 960], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x?x?x960xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 960, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x960x160xf32> -> tensor<1x1x960x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x960xf32>, tensor<1x1x960x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x160xf32>
}
}
}
flow.return
}
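// %77: tiled 1x1 expansion convolution, 160 -> 960 channels, with the fused
// scale/shift epilogue followed by a ReLU6 clamp: the two cmpf/select pairs
// compute min(max(x, 0), 6), and the cmpf uno branches route unordered (NaN)
// inputs to the quiet-NaN constant 0x7FC00000.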
%77 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_57, %cst_59, %cst_237, %cst_58, %76, %cst_60) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x160x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c960_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 160], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x160xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 160, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x160x960xf32> -> tensor<1x1x160x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x160xf32>, tensor<1x1x160x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
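// %78: materializes a zero-filled 1x9x9x960 tensor, the padded destination
// for the 3x3 depthwise convolution two dispatches below.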
%78 = flow.dispatch.workgroups[%c960, %c9, %c9]() : () -> tensor<1x9x9x960xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x9x9x960xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 9, 9, 960] : tensor<1x9x9x960xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<writeonly:1x9x9x960xf32>
flow.return
}
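// %79: inserts %77's 1x7x7x960 activation into the zero buffer at offset
// [0, 1, 1, 0], i.e. an explicit one-pixel zero pad on each spatial edge.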
%79 = flow.dispatch.workgroups[%c960, %c9, %c9](%77, %78) : (tensor<1x7x7x960xf32>, tensor<1x9x9x960xf32>) -> %78 =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x9x9x960xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x7x7x960xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x9x9x960xf32> -> tensor<1x9x9x960xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<readwrite:1x9x9x960xf32>
flow.return
}
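// %80: tiled 3x3 depthwise convolution over the padded 1x9x9x960 input; each
// input tile is loaded with a two-element halo (the "d1 + 2" terms in the
// affine.min maps), and the epilogue fuses scale/shift plus the same ReLU6
// clamp pattern as %77.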
%80 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_54, %cst_56, %cst_238, %cst_55, %79, %cst_259) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x9x9x960xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c960_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x9x9x960xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x960xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
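// %81: as with %76, a 1x1 projection back to 160 channels with fused
// scale/shift and a residual add against %76's output.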
%81 = flow.dispatch.workgroups[%c160, %c7, %c7](%76, %cst_61, %cst_63, %cst_239, %cst_62, %80, %cst_64) : (tensor<1x7x7x160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<160xf32>, tensor<1x7x7x960xf32>, tensor<1x1x960x160xf32>) -> tensor<1x7x7x160xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg2: !flow.dispatch.tensor<readonly:160xf32>, %arg3: !flow.dispatch.tensor<readonly:160xf32>, %arg4: !flow.dispatch.tensor<readonly:160xf32>, %arg5: !flow.dispatch.tensor<readonly:160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg7: !flow.dispatch.tensor<readonly:1x1x960x160xf32>, %arg8: !flow.dispatch.tensor<writeonly:1x7x7x160xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c160_264 = constant 160 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg9 = %94 to %c7_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg10 = %96 to %c7_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg11 = %98 to %c160_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %100, %101, %102], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg11], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg11], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg11], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%111 = flow.dispatch.tensor.load %arg5, offsets = [%arg11], sizes = [%110], strides = [1] : !flow.dispatch.tensor<readonly:160xf32> -> tensor<?xf32>
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg10, %workgroup_size_1)
%114 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 160)>(%arg11, %workgroup_size_0)
%115 = linalg.init_tensor [1, %112, %113, %114] : tensor<1x?x?x?xf32>
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%118 = flow.dispatch.tensor.load %arg6, offsets = [0, %arg9, %arg10, 0], sizes = [1, %116, %117, 960], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x?x?x960xf32>
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%120 = flow.dispatch.tensor.load %arg7, offsets = [0, 0, 0, %arg11], sizes = [1, 1, 960, %119], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x960x160xf32> -> tensor<1x1x960x?xf32>
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_2)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg10, %workgroup_size_1)
%123 = affine.min affine_map<(d0, d1) -> (-d0 + 160, d1)>(%arg11, %workgroup_size_0)
%124 = linalg.init_tensor [1, %121, %122, %123] : tensor<1x?x?x?xf32>
%125 = linalg.fill(%124, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%126 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%118, %120 : tensor<1x?x?x960xf32>, tensor<1x1x960x?xf32>) outs(%125 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%127 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %126, %105, %107, %109, %111 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%115 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32, %arg17: f32, %arg18: f32): // no predecessors
%128 = subf %arg13, %arg14 : f32
%129 = mulf %128, %arg15 : f32
%130 = divf %129, %arg16 : f32
%131 = addf %130, %arg17 : f32
%132 = addf %arg12, %131 : f32
linalg.yield %132 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %127, %arg8, offsets = [0, %arg9, %arg10, %arg11], sizes = [1, %112, %113, %114], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x160xf32>
}
}
}
flow.return
}
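// %82: 1x1 expansion to 960 channels with fused scale/shift and ReLU6,
// mirroring %77.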
%82 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_68, %cst_70, %cst_240, %cst_69, %81, %cst_71) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x7x7x160xf32>, tensor<1x1x160x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x160xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x160x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c960_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 160], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x160xf32> -> tensor<1x?x?x160xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 160, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x160x960xf32> -> tensor<1x1x160x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x160xf32>, tensor<1x1x160x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
%128 = cmpf olt, %127, %cst_263 : f32
%129 = select %128, %127, %cst_263 : f32
%130 = cmpf uno, %127, %cst_263 : f32
%131 = select %130, %cst_262, %129 : f32
%132 = cmpf ogt, %131, %cst_264 : f32
%133 = select %132, %131, %cst_264 : f32
%134 = cmpf uno, %131, %cst_264 : f32
%135 = select %134, %cst_262, %133 : f32
linalg.yield %135 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
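// %83: another zero-filled 1x9x9x960 pad buffer.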
%83 = flow.dispatch.workgroups[%c960, %c9, %c9]() : () -> tensor<1x9x9x960xf32> =
(%arg1: !flow.dispatch.tensor<writeonly:1x9x9x960xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%94 = linalg.init_tensor [1, 9, 9, 960] : tensor<1x9x9x960xf32>
%95 = linalg.fill(%94, %cst_262) : tensor<1x9x9x960xf32>, f32 -> tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %95, %arg1, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<writeonly:1x9x9x960xf32>
flow.return
}
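// %84: pads %82's 1x7x7x960 result into the 9x9 buffer at offset
// [0, 1, 1, 0].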
%84 = flow.dispatch.workgroups[%c960, %c9, %c9](%82, %83) : (tensor<1x7x7x960xf32>, tensor<1x9x9x960xf32>) -> %83 =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg2: !flow.dispatch.tensor<readwrite:1x9x9x960xf32>) {
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x7x7x960xf32>
%95 = flow.dispatch.tensor.load %arg2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:1x9x9x960xf32> -> tensor<1x9x9x960xf32>
%96 = subtensor_insert %94 into %95[0, 1, 1, 0] [1, 7, 7, 960] [1, 1, 1, 1] : tensor<1x7x7x960xf32> into tensor<1x9x9x960xf32>
flow.dispatch.tensor.store %96, %arg2, offsets = [], sizes = [], strides = [] : tensor<1x9x9x960xf32> -> !flow.dispatch.tensor<readwrite:1x9x9x960xf32>
flow.return
}
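// %85: 3x3 depthwise convolution over the padded input with fused
// scale/shift and ReLU6, completing this block's depthwise stage.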
%85 = flow.dispatch.workgroups[%c960, %c7, %c7](%cst_65, %cst_67, %cst_241, %cst_66, %84, %cst_260) : (tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<960xf32>, tensor<1x9x9x960xf32>, tensor<3x3x960xf32>) -> tensor<1x7x7x960xf32> =
(%arg1: !flow.dispatch.tensor<readonly:960xf32>, %arg2: !flow.dispatch.tensor<readonly:960xf32>, %arg3: !flow.dispatch.tensor<readonly:960xf32>, %arg4: !flow.dispatch.tensor<readonly:960xf32>, %arg5: !flow.dispatch.tensor<readonly:1x9x9x960xf32>, %arg6: !flow.dispatch.tensor<readonly:3x3x960xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x960xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c7_265 = constant 7 : index
%c960_266 = constant 960 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_265 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_265 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c960_266 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%109 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%108], strides = [1] : !flow.dispatch.tensor<readonly:960xf32> -> tensor<?xf32>
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%111 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%112 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 960)>(%arg10, %workgroup_size_0)
%113 = linalg.init_tensor [1, %110, %111, %112] : tensor<1x?x?x?xf32>
%114 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg8, %100)
%115 = affine.min affine_map<(d0, d1) -> (d1 + 2, -d0 + 9)>(%arg9, %101)
%116 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%117 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %114, %115, %116], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x9x9x960xf32> -> tensor<1x?x?x?xf32>
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%119 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, %arg10], sizes = [3, 3, %118], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x960xf32> -> tensor<3x3x?xf32>
%120 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%121 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%122 = affine.min affine_map<(d0, d1) -> (-d0 + 960, d1)>(%arg10, %workgroup_size_0)
%123 = linalg.init_tensor [1, %120, %121, %122] : tensor<1x?x?x?xf32>
%124 = linalg.fill(%123, %cst_264) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%125 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc {strides = dense<1> : tensor<2xi64>} ins(%117, %119 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%124 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%126 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %103, %105, %107, %109 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%113 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%127 = subf %arg11, %arg12 : f32
%128 = mulf %127, %arg13 : f32
%129 = divf %128, %arg14 : f32
%130 = addf %129, %arg15 : f32
%131 = cmpf olt, %130, %cst_263 : f32
%132 = select %131, %130, %cst_263 : f32
%133 = cmpf uno, %130, %cst_263 : f32
%134 = select %133, %cst_262, %132 : f32
%135 = cmpf ogt, %134, %cst_264 : f32
%136 = select %135, %134, %cst_264 : f32
%137 = cmpf uno, %134, %cst_264 : f32
%138 = select %137, %cst_262, %136 : f32
linalg.yield %138 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %126, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %110, %111, %112], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x960xf32>
}
}
}
flow.return
}
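// %86: 1x1 projection to 320 channels. Note the epilogue applies only the
// scale/shift arithmetic -- no clamp and no residual add -- consistent with
// a linear bottleneck output.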
%86 = flow.dispatch.workgroups[%c320, %c7, %c7](%cst_72, %cst_74, %cst_242, %cst_73, %85, %cst_75) : (tensor<320xf32>, tensor<320xf32>, tensor<320xf32>, tensor<320xf32>, tensor<1x7x7x960xf32>, tensor<1x1x960x320xf32>) -> tensor<1x7x7x320xf32> =
(%arg1: !flow.dispatch.tensor<readonly:320xf32>, %arg2: !flow.dispatch.tensor<readonly:320xf32>, %arg3: !flow.dispatch.tensor<readonly:320xf32>, %arg4: !flow.dispatch.tensor<readonly:320xf32>, %arg5: !flow.dispatch.tensor<readonly:1x7x7x960xf32>, %arg6: !flow.dispatch.tensor<readonly:1x1x960x320xf32>, %arg7: !flow.dispatch.tensor<writeonly:1x7x7x320xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c320_264 = constant 320 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg8 = %94 to %c7_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg9 = %96 to %c7_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg10 = %98 to %c320_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg1, offsets = [%arg10], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [%arg10], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg3, offsets = [%arg10], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg4, offsets = [%arg10], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:320xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg8, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg9, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 320)>(%arg10, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%113 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%114 = flow.dispatch.tensor.load %arg5, offsets = [0, %arg8, %arg9, 0], sizes = [1, %112, %113, 960], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x960xf32> -> tensor<1x?x?x960xf32>
%115 = affine.min affine_map<(d0, d1) -> (-d0 + 320, d1)>(%arg10, %workgroup_size_0)
%116 = flow.dispatch.tensor.load %arg6, offsets = [0, 0, 0, %arg10], sizes = [1, 1, 960, %115], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x960x320xf32> -> tensor<1x1x960x?xf32>
%117 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg8, %workgroup_size_2)
%118 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg9, %workgroup_size_1)
%119 = affine.min affine_map<(d0, d1) -> (-d0 + 320, d1)>(%arg10, %workgroup_size_0)
%120 = linalg.init_tensor [1, %117, %118, %119] : tensor<1x?x?x?xf32>
%121 = linalg.fill(%120, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%122 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%114, %116 : tensor<1x?x?x960xf32>, tensor<1x1x960x?xf32>) outs(%121 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%122, %101, %103, %105, %107 : tensor<1x?x?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<1x?x?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg11: f32, %arg12: f32, %arg13: f32, %arg14: f32, %arg15: f32, %arg16: f32): // no predecessors
%124 = subf %arg11, %arg12 : f32
%125 = mulf %124, %arg13 : f32
%126 = divf %125, %arg14 : f32
%127 = addf %126, %arg15 : f32
linalg.yield %127 : f32
} -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %123, %arg7, offsets = [0, %arg8, %arg9, %arg10], sizes = [1, %108, %109, %110], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x320xf32>
}
}
}
flow.return
}
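// %87: 1x1 convolution expanding 320 -> 1280 channels. Here the conv op
// itself carries the __internal_linalg_transform__ = "workgroup" marker and
// there is no fused epilogue in this dispatch.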
%87 = flow.dispatch.workgroups[%c1280, %c7, %c7](%86, %cst_182) : (tensor<1x7x7x320xf32>, tensor<1x1x320x1280xf32>) -> tensor<1x7x7x1280xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x320xf32>, %arg2: !flow.dispatch.tensor<readonly:1x1x320x1280xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x7x7x1280xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c7_263 = constant 7 : index
%c1280_264 = constant 1280 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg4 = %94 to %c7_263 step %95 {
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg5 = %96 to %c7_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg6 = %98 to %c1280_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg4, %workgroup_size_2)
%101 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg5, %workgroup_size_1)
%102 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4, %arg5, 0], sizes = [1, %100, %101, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x7x7x320xf32> -> tensor<1x?x?x320xf32>
%103 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg6, %workgroup_size_0)
%104 = flow.dispatch.tensor.load %arg2, offsets = [0, 0, 0, %arg6], sizes = [1, 1, 320, %103], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x1x320x1280xf32> -> tensor<1x1x320x?xf32>
%105 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg4, %workgroup_size_2)
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 7)>(%arg5, %workgroup_size_1)
%107 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg6, %workgroup_size_0)
%108 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg4, %workgroup_size_2)
%109 = affine.min affine_map<(d0, d1) -> (-d0 + 7, d1)>(%arg5, %workgroup_size_1)
%110 = affine.min affine_map<(d0, d1) -> (-d0 + 1280, d1)>(%arg6, %workgroup_size_0)
%111 = linalg.init_tensor [1, %108, %109, %110] : tensor<1x?x?x?xf32>
%112 = linalg.fill(%111, %cst_262) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%113 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%102, %104 : tensor<1x?x?x320xf32>, tensor<1x1x320x?xf32>) outs(%112 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %113, %arg3, offsets = [0, %arg4, %arg5, %arg6], sizes = [1, %105, %106, %107], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x1280xf32>
}
}
}
flow.return
}
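// %88: collapses 1x7x7x1280 to 7x7x1280 via linalg.tensor_reshape, then a
// single linalg.generic fuses scale/shift, the ReLU6 clamp, and a sum
// reduction over the two 7x7 spatial dimensions -- the accumulation half of
// a global average pool, yielding a 1280-element vector.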
%88 = flow.dispatch.workgroups[%c1280, %c1, %c1](%87, %cst_179, %cst_181, %cst_243, %cst_180) : (tensor<1x7x7x1280xf32>, tensor<1280xf32>, tensor<1280xf32>, tensor<1280xf32>, tensor<1280xf32>) -> tensor<1280xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1x7x7x1280xf32>, %arg2: !flow.dispatch.tensor<readonly:1280xf32>, %arg3: !flow.dispatch.tensor<readonly:1280xf32>, %arg4: !flow.dispatch.tensor<readonly:1280xf32>, %arg5: !flow.dispatch.tensor<readonly:1280xf32>, %arg6: !flow.dispatch.tensor<writeonly:1280xf32>) {
%cst_262 = constant 0x7FC00000 : f32
%cst_263 = constant 6.000000e+00 : f32
%cst_264 = constant 0.000000e+00 : f32
%c1280_265 = constant 1280 : index
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x7x7x1280xf32> -> tensor<1x7x7x1280xf32>
%95 = linalg.tensor_reshape %94 [affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d2)>, affine_map<(d0, d1, d2, d3) -> (d3)>] : tensor<1x7x7x1280xf32> into tensor<7x7x1280xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %96 to %c1280_265 step %97 {
%98 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%99 = subtensor %95[0, 0, %arg7] [7, 7, %98] [1, 1, 1] : tensor<7x7x1280xf32> to tensor<7x7x?xf32>
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%101 = flow.dispatch.tensor.load %arg2, offsets = [%arg7], sizes = [%100], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg3, offsets = [%arg7], sizes = [%102], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%105 = flow.dispatch.tensor.load %arg4, offsets = [%arg7], sizes = [%104], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%106 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%107 = flow.dispatch.tensor.load %arg5, offsets = [%arg7], sizes = [%106], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%108 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg7, %workgroup_size_0)
%109 = affine.min affine_map<(d0, d1) -> (-d0 + 1280, d1)>(%arg7, %workgroup_size_0)
%110 = linalg.init_tensor [%109] : tensor<?xf32>
%111 = linalg.fill(%110, %cst_264) : tensor<?xf32>, f32 -> tensor<?xf32>
%112 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%99, %101, %103, %105, %107 : tensor<7x7x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%111 : tensor<?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // no predecessors
%113 = subf %arg8, %arg9 : f32
%114 = mulf %113, %arg10 : f32
%115 = divf %114, %arg11 : f32
%116 = addf %115, %arg12 : f32
%117 = cmpf olt, %116, %cst_263 : f32
%118 = select %117, %116, %cst_263 : f32
%119 = cmpf uno, %116, %cst_263 : f32
%120 = select %119, %cst_262, %118 : f32
%121 = cmpf ogt, %120, %cst_264 : f32
%122 = select %121, %120, %cst_264 : f32
%123 = cmpf uno, %120, %cst_264 : f32
%124 = select %123, %cst_262, %122 : f32
%125 = addf %124, %arg13 : f32
linalg.yield %125 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %112, %arg6, offsets = [%arg7], sizes = [%108], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:1280xf32>
}
flow.return
}
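// %89: divides each of the 1280 partial sums by 4.900000e+01 (= 7 * 7),
// completing the spatial mean of the global average pool.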
%89 = flow.dispatch.workgroups[%c1280, %c1, %c1](%88) : (tensor<1280xf32>) -> tensor<1280xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1280xf32>, %arg2: !flow.dispatch.tensor<writeonly:1280xf32>) {
%cst_262 = constant 4.900000e+01 : f32
%c1280_263 = constant 1280 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%94 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%95 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg3 = %94 to %c1280_263 step %95 {
%96 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg3, %workgroup_size_0)
%97 = flow.dispatch.tensor.load %arg1, offsets = [%arg3], sizes = [%96], strides = [1] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<?xf32>
%98 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1280)>(%arg3, %workgroup_size_0)
%99 = linalg.init_tensor [%98] : tensor<?xf32>
%100 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%97 : tensor<?xf32>) outs(%99 : tensor<?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg4: f32, %arg5: f32): // no predecessors
%101 = divf %arg4, %cst_262 : f32
linalg.yield %101 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %100, %arg2, offsets = [%arg3], sizes = [%98], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:1280xf32>
}
flow.return
}
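// %90: reshapes the pooled 1280-vector to 1x1280 and, judging by the tiled
// operands (?x1280 activation slices against 1280x? weight slices), performs
// the final (1x1280) x (1280x1000) matrix multiply producing the 1x1000
// logits. The listing breaks off partway through this dispatch region.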
%90 = flow.dispatch.workgroups[%c1000, %c1, %c1](%89, %cst_191) : (tensor<1280xf32>, tensor<1280x1000xf32>) -> tensor<1x1000xf32> =
(%arg1: !flow.dispatch.tensor<readonly:1280xf32>, %arg2: !flow.dispatch.tensor<readonly:1280x1000xf32>, %arg3: !flow.dispatch.tensor<writeonly:1x1000xf32>) {
%cst_262 = constant 0.000000e+00 : f32
%c1_263 = constant 1 : index
%c1000_264 = constant 1000 : index
%94 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1280xf32> -> tensor<1280xf32>
%95 = linalg.tensor_reshape %94 [affine_map<(d0, d1) -> (d0, d1)>] : tensor<1280xf32> into tensor<1x1280xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%96 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%97 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %96 to %c1_263 step %97 {
%98 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%99 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %98 to %c1000_264 step %99 {
%100 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1)>(%arg4, %workgroup_size_1)
%101 = subtensor %95[%arg4, 0] [%100, 1280] [1, 1] : tensor<1x1280xf32> to tensor<?x1280xf32>
%102 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1000)>(%arg5, %workgroup_size_0)
%103 = flow.dispatch.tensor.load %arg2, offsets = [0, %arg5], sizes = [1280, %102], strides = [1, 1] : !flow.dispatch.tensor<readonly:1280x1000xf32> -> tensor<1280x?xf32>
%104 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 1)>(%arg4, %workgroup_size_1)
%105 = affine.min affine_map<(d0,