Skip to content

Instantly share code, notes, and snippets.

@yzhang93
Created July 9, 2024 00:04
Show Gist options
  • Save yzhang93/456640440608e48550308bf87245523c to your computer and use it in GitHub Desktop.
deeplab_dispatches
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<576xi8> -> tensor<576xf32>. Each element is
// sign-extended to i32, converted to f32, then multiplied by 1.562500e-02
// (= 1/64) — presumably a quantization scale; confirm against the model.
util.func public @jit_eval_0(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_0(%input0: tensor<576xi8>) -> (%output0: tensor<576xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576xi8>
// Uninitialized destination; fully overwritten by the generic below.
%1 = tensor.empty() : tensor<576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<576xi8>) outs(%1 : tensor<576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<192xi8> -> tensor<192xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_3(%input0: tensor<192xi8>) -> (%output0: tensor<192xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192xi8>
%1 = tensor.empty() : tensor<192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<192xi8>) outs(%1 : tensor<192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<96x3x3xi8> -> tensor<96x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_10(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_10(%input0: tensor<96x3x3xi8>) -> (%output0: tensor<96x3x3xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x3x3xi8>
%1 = tensor.empty() : tensor<96x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<96x3x3xi8>) outs(%1 : tensor<96x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<144xi8> -> tensor<144xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_2(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_2(%input0: tensor<144xi8>) -> (%output0: tensor<144xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144xi8>
%1 = tensor.empty() : tensor<144xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<144xi8>) outs(%1 : tensor<144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<960xi8> -> tensor<960xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_1(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_1(%input0: tensor<960xi8>) -> (%output0: tensor<960xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960xi8>
%1 = tensor.empty() : tensor<960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<960xi8>) outs(%1 : tensor<960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<384xi8> -> tensor<384xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_4(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_4(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<32xi8> -> tensor<32xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_6(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_6(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Dequantize with layout permutation: tensor<32x3x3x3xi8> -> tensor<3x3x3x32xf32>.
// The input map (d3, d2, d0, d1) means out[d0,d1,d2,d3] = dequant(in[d3,d2,d0,d1]);
// each element is sign-extended, converted to f32, and scaled by 3.125000e-02 (= 1/32).
// NOTE(review): looks like a conv-filter layout change (e.g. OIHW -> HWIO) — confirm.
util.func public @jit_eval_7(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_7(%input0: tensor<32x3x3x3xi8>) -> (%output0: tensor<3x3x3x32xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x3x3x3xi8>
%1 = tensor.empty() : tensor<3x3x3x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d2, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x3x3x3xi8>) outs(%1 : tensor<3x3x3x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<3x3x3x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<3x3x3x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<96xi8> -> tensor<96xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<256xi8> -> tensor<256xf32>; sign-extend,
// int->float, then scale by 9.765625E-4 (= 1/1024).
util.func public @jit_eval_5(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_5(%input0: tensor<256xi8>) -> (%output0: tensor<256xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256xi8>
%1 = tensor.empty() : tensor<256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<256xi8>) outs(%1 : tensor<256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<16xi8> -> tensor<16xf32>; sign-extend,
// int->float, then scale by 2.500000e-01 (= 1/4).
util.func public @jit_eval_9(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_9(%input0: tensor<16xi8>) -> (%output0: tensor<16xf32>)"}} {
%cst = arith.constant 2.500000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16xi8>
%1 = tensor.empty() : tensor<16xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<16xi8>) outs(%1 : tensor<16xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<16xf32>
%3 = hal.tensor.export %2 "output0" : tensor<16xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<24xi8> -> tensor<24xf32>; sign-extend,
// int->float, then scale by 5.000000e-01 (= 1/2).
util.func public @jit_eval_11(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_11(%input0: tensor<24xi8>) -> (%output0: tensor<24xf32>)"}} {
%cst = arith.constant 5.000000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24xi8>
%1 = tensor.empty() : tensor<24xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<24xi8>) outs(%1 : tensor<24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<32x3x3xi8> -> tensor<32x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 5.000000e-01 (= 1/2).
util.func public @jit_eval_8(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_8(%input0: tensor<32x3x3xi8>) -> (%output0: tensor<32x3x3xf32>)"}} {
%cst = arith.constant 5.000000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x3x3xi8>
%1 = tensor.empty() : tensor<32x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<32x3x3xi8>) outs(%1 : tensor<32x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<384x3x3xi8> -> tensor<384x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 6.250000e-02 (= 1/16).
util.func public @jit_eval_12(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_12(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<384xi8> -> tensor<384xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_13(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_13(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<576xi8> -> tensor<576xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_18(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_18(%input0: tensor<576xi8>) -> (%output0: tensor<576xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576xi8>
%1 = tensor.empty() : tensor<576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<576xi8>) outs(%1 : tensor<576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<576xi8> -> tensor<576xf32>; sign-extend,
// int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_21(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_21(%input0: tensor<576xi8>) -> (%output0: tensor<576xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576xi8>
%1 = tensor.empty() : tensor<576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<576xi8>) outs(%1 : tensor<576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<96xi8> -> tensor<96xf32>; sign-extend,
// int->float, then scale by 1.250000e-01 (= 1/8).
util.func public @jit_eval_14(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_14(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<960x3x3xi8> -> tensor<960x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 6.250000e-02 (= 1/16).
util.func public @jit_eval_23(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_23(%input0: tensor<960x3x3xi8>) -> (%output0: tensor<960x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x3x3xi8>
%1 = tensor.empty() : tensor<960x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<960x3x3xi8>) outs(%1 : tensor<960x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<576x3x3xi8> -> tensor<576x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 6.250000e-02 (= 1/16).
util.func public @jit_eval_15(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_15(%input0: tensor<576x3x3xi8>) -> (%output0: tensor<576x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x3x3xi8>
%1 = tensor.empty() : tensor<576x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<576x3x3xi8>) outs(%1 : tensor<576x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<96xi8> -> tensor<96xf32>; sign-extend,
// int->float, then scale by 1.250000e-01 (= 1/8).
util.func public @jit_eval_19(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_19(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<960xi8> -> tensor<960xf32>; sign-extend,
// int->float, then scale by 1.562500e-02 (= 1/64).
util.func public @jit_eval_26(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_26(%input0: tensor<960xi8>) -> (%output0: tensor<960xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960xi8>
%1 = tensor.empty() : tensor<960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<960xi8>) outs(%1 : tensor<960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<160xi8> -> tensor<160xf32>; sign-extend,
// int->float, then scale by 1.250000e-01 (= 1/8).
util.func public @jit_eval_27(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_27(%input0: tensor<160xi8>) -> (%output0: tensor<160xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160xi8>
%1 = tensor.empty() : tensor<160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<160xi8>) outs(%1 : tensor<160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<576x3x3xi8> -> tensor<576x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 6.250000e-02 (= 1/16).
util.func public @jit_eval_17(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_17(%input0: tensor<576x3x3xi8>) -> (%output0: tensor<576x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x3x3xi8>
%1 = tensor.empty() : tensor<576x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<576x3x3xi8>) outs(%1 : tensor<576x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<96xi8> -> tensor<96xf32>; sign-extend,
// int->float, then scale by 6.250000e-02 (= 1/16).
util.func public @jit_eval_16(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_16(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<960x3x3xi8> -> tensor<960x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 1.250000e-01 (= 1/8).
util.func public @jit_eval_28(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_28(%input0: tensor<960x3x3xi8>) -> (%output0: tensor<960x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x3x3xi8>
%1 = tensor.empty() : tensor<960x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<960x3x3xi8>) outs(%1 : tensor<960x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<160xi8> -> tensor<160xf32>; sign-extend,
// int->float, then scale by 1.250000e-01 (= 1/8).
util.func public @jit_eval_22(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_22(%input0: tensor<160xi8>) -> (%output0: tensor<160xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160xi8>
%1 = tensor.empty() : tensor<160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<160xi8>) outs(%1 : tensor<160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<576x3x3xi8> -> tensor<576x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_20(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_20(%input0: tensor<576x3x3xi8>) -> (%output0: tensor<576x3x3xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x3x3xi8>
%1 = tensor.empty() : tensor<576x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<576x3x3xi8>) outs(%1 : tensor<576x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<960x3x3xi8> -> tensor<960x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 6.250000e-02 (= 1/16).
util.func public @jit_eval_25(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_25(%input0: tensor<960x3x3xi8>) -> (%output0: tensor<960x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x3x3xi8>
%1 = tensor.empty() : tensor<960x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<960x3x3xi8>) outs(%1 : tensor<960x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<160xi8> -> tensor<160xf32>; sign-extend,
// int->float, then scale by 6.250000e-02 (= 1/16).
util.func public @jit_eval_24(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_24(%input0: tensor<160xi8>) -> (%output0: tensor<160xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160xi8>
%1 = tensor.empty() : tensor<160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<160xi8>) outs(%1 : tensor<160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<144x3x3xi8> -> tensor<144x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 1.250000e-01 (= 1/8).
util.func public @jit_eval_31(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_31(%input0: tensor<144x3x3xi8>) -> (%output0: tensor<144x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x3x3xi8>
%1 = tensor.empty() : tensor<144x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<144x3x3xi8>) outs(%1 : tensor<144x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<960xi8> -> tensor<960xf32>; sign-extend,
// int->float, then scale by 1.562500e-02 (= 1/64).
util.func public @jit_eval_29(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_29(%input0: tensor<960xi8>) -> (%output0: tensor<960xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960xi8>
%1 = tensor.empty() : tensor<960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<960xi8>) outs(%1 : tensor<960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Elementwise dequantize: tensor<144x3x3xi8> -> tensor<144x3x3xf32> (identity
// layout); sign-extend, int->float, then scale by 3.125000e-02 (= 1/32).
util.func public @jit_eval_33(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_33(%input0: tensor<144x3x3xi8>) -> (%output0: tensor<144x3x3xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x3x3xi8>
%1 = tensor.empty() : tensor<144x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<144x3x3xi8>) outs(%1 : tensor<144x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<144xi8>, scaling each value by 3.125e-02 (2^-5).
util.func public @jit_eval_34(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_34(%input0: tensor<144xi8>) -> (%output0: tensor<144xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144xi8>
%1 = tensor.empty() : tensor<144xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<144xi8>) outs(%1 : tensor<144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<192xi8>, scaling each value by 3.125e-02 (2^-5).
util.func public @jit_eval_39(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_39(%input0: tensor<192xi8>) -> (%output0: tensor<192xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192xi8>
%1 = tensor.empty() : tensor<192xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<192xi8>) outs(%1 : tensor<192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<192x3x3xi8>, scaling each value by 1.5625e-02 (2^-6).
util.func public @jit_eval_41(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_41(%input0: tensor<192x3x3xi8>) -> (%output0: tensor<192x3x3xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x3x3xi8>
%1 = tensor.empty() : tensor<192x3x3xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<192x3x3xi8>) outs(%1 : tensor<192x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<32xi8>, scaling each value by 1.25e-01 (2^-3).
util.func public @jit_eval_40(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_40(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<320xi8>, scaling each value by 1.25e-01 (2^-3).
util.func public @jit_eval_30(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_30(%input0: tensor<320xi8>) -> (%output0: tensor<320xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<320xi8>
%1 = tensor.empty() : tensor<320xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<320xi8>) outs(%1 : tensor<320xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<320xf32>
%3 = hal.tensor.export %2 "output0" : tensor<320xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<24xi8>, scaling each value by 1.25e-01 (2^-3).
util.func public @jit_eval_32(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_32(%input0: tensor<24xi8>) -> (%output0: tensor<24xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24xi8>
%1 = tensor.empty() : tensor<24xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<24xi8>) outs(%1 : tensor<24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<32xi8>, scaling each value by 2.5e-01 (2^-2).
util.func public @jit_eval_35(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_35(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 2.500000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<192x3x3xi8>, scaling each value by 1.25e-01 (2^-3).
util.func public @jit_eval_36(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_36(%input0: tensor<192x3x3xi8>) -> (%output0: tensor<192x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x3x3xi8>
%1 = tensor.empty() : tensor<192x3x3xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<192x3x3xi8>) outs(%1 : tensor<192x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<192x3x3xi8>, scaling each value by 6.25e-02 (2^-4).
util.func public @jit_eval_38(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_38(%input0: tensor<192x3x3xi8>) -> (%output0: tensor<192x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x3x3xi8>
%1 = tensor.empty() : tensor<192x3x3xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<192x3x3xi8>) outs(%1 : tensor<192x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<32xi8>, scaling each value by 1.25e-01 (2^-3).
util.func public @jit_eval_37(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_37(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<64xi8>, scaling each value by 6.25e-02 (2^-4).
util.func public @jit_eval_45(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_45(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<384x3x3xi8>, scaling each value by 6.25e-02 (2^-4).
util.func public @jit_eval_46(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_46(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<64xi8>, scaling each value by 2.5e-01 (2^-2).
util.func public @jit_eval_43(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_43(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 2.500000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<192xi8>, scaling each value by 3.125e-02 (2^-5).
util.func public @jit_eval_42(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_42(%input0: tensor<192xi8>) -> (%output0: tensor<192xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192xi8>
%1 = tensor.empty() : tensor<192xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<192xi8>) outs(%1 : tensor<192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<384xi8>, scaling each value by 1.5625e-02 (2^-6).
util.func public @jit_eval_47(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_47(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<64xi8>, scaling each value by 6.25e-02 (2^-4).
util.func public @jit_eval_48(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_48(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<384x3x3xi8>, scaling each value by 1.25e-01 (2^-3).
util.func public @jit_eval_44(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_44(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<384x3x3xi8>, scaling each value by 6.25e-02 (2^-4).
util.func public @jit_eval_49(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_49(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<64xi8>, scaling each value by 1.25e-01 (2^-3).
util.func public @jit_eval_51(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_51(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<256xi8>, scaling each value by 1.5625e-02 (2^-6).
util.func public @jit_eval_52(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_52(%input0: tensor<256xi8>) -> (%output0: tensor<256xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256xi8>
%1 = tensor.empty() : tensor<256xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<256xi8>) outs(%1 : tensor<256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-8) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<144x1x1x24xi8> -> tensor<1x1x24x144xf32>.
util.func public @jit_eval_58(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_58(%input0: tensor<144x1x1x24xi8>) -> (%output0: tensor<1x1x24x144xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x1x1x24xi8>
%1 = tensor.empty() : tensor<1x1x24x144xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<144x1x1x24xi8>) outs(%1 : tensor<1x1x24x144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x24x144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x24x144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<384xi8>, scaling each value by 1.5625e-02 (2^-6).
util.func public @jit_eval_50(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_50(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-8) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<96x1x1x16xi8> -> tensor<1x1x16x96xf32>.
util.func public @jit_eval_56(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_56(%input0: tensor<96x1x1x16xi8>) -> (%output0: tensor<1x1x16x96xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x16xi8>
%1 = tensor.empty() : tensor<1x1x16x96xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x16xi8>) outs(%1 : tensor<1x1x16x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x16x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x16x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-8) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<144x1x1x24xi8> -> tensor<1x1x24x144xf32>.
util.func public @jit_eval_60(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_60(%input0: tensor<144x1x1x24xi8>) -> (%output0: tensor<1x1x24x144xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x1x1x24xi8>
%1 = tensor.empty() : tensor<1x1x24x144xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<144x1x1x24xi8>) outs(%1 : tensor<1x1x24x144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x24x144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x24x144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-9) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<192x1x1x32xi8> -> tensor<1x1x32x192xf32>.
util.func public @jit_eval_62(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_62(%input0: tensor<192x1x1x32xi8>) -> (%output0: tensor<1x1x32x192xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x192xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<192x1x1x32xi8>) outs(%1 : tensor<1x1x32x192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-4) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<16x1x1x32xi8> -> tensor<1x1x32x16xf32>.
util.func public @jit_eval_55(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_55(%input0: tensor<16x1x1x32xi8>) -> (%output0: tensor<1x1x32x16xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x16xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<16x1x1x32xi8>) outs(%1 : tensor<1x1x32x16xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x16xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x16xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-6) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<32x1x1x144xi8> -> tensor<1x1x144x32xf32>.
util.func public @jit_eval_61(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_61(%input0: tensor<32x1x1x144xi8>) -> (%output0: tensor<1x1x144x32xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x1x1x144xi8>
%1 = tensor.empty() : tensor<1x1x144x32xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x1x1x144xi8>) outs(%1 : tensor<1x1x144x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x144x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x144x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<21xi8>, scaling each value by 1.5625e-02 (2^-6).
util.func public @jit_eval_54(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_54(%input0: tensor<21xi8>) -> (%output0: tensor<21xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<21xi8>
%1 = tensor.empty() : tensor<21xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<21xi8>) outs(%1 : tensor<21xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<21xf32>
%3 = hal.tensor.export %2 "output0" : tensor<21xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dequantization dispatch: element-wise i8 -> f32 conversion
// of a tensor<256xi8>, scaling each value by 0.001953125 (2^-9).
util.func public @jit_eval_53(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_53(%input0: tensor<256xi8>) -> (%output0: tensor<256xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256xi8>
%1 = tensor.empty() : tensor<256xf32>
// Per element: sign-extend i8 -> i32, convert to f32, multiply by the scale.
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<256xi8>) outs(%1 : tensor<256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-10) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<192x1x1x32xi8> -> tensor<1x1x32x192xf32>.
util.func public @jit_eval_64(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_64(%input0: tensor<192x1x1x32xi8>) -> (%output0: tensor<1x1x32x192xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x192xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<192x1x1x32xi8>) outs(%1 : tensor<1x1x32x192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-6) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<32x1x1x192xi8> -> tensor<1x1x192x32xf32>.
util.func public @jit_eval_63(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_63(%input0: tensor<32x1x1x192xi8>) -> (%output0: tensor<1x1x192x32xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x1x1x192xi8>
%1 = tensor.empty() : tensor<1x1x192x32xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x1x1x192xi8>) outs(%1 : tensor<1x1x192x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x192x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x192x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
// Constant-folded dispatch: dequantize (i8 -> f32, scale 2^-5) fused with a
// transpose of the leading dim to the back: in[d,a,b,c] -> out[a,b,c,d],
// i.e. tensor<24x1x1x144xi8> -> tensor<1x1x144x24xf32>.
util.func public @jit_eval_59(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_59(%input0: tensor<24x1x1x144xi8>) -> (%output0: tensor<1x1x144x24xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24x1x1x144xi8>
%1 = tensor.empty() : tensor<1x1x144x24xf32>
// The permuted input map (d3, d0, d1, d2) performs the transpose in-flight.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<24x1x1x144xi8>) outs(%1 : tensor<1x1x144x24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x144x24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x144x24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_57(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_57(%input0: tensor<24x1x1x96xi8>) -> (%output0: tensor<1x1x96x24xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x24xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<24x1x1x96xi8>) outs(%1 : tensor<1x1x96x24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_67(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_67(%input0: tensor<64x1x1x192xi8>) -> (%output0: tensor<1x1x192x64xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x192xi8>
%1 = tensor.empty() : tensor<1x1x192x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x192xi8>) outs(%1 : tensor<1x1x192x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x192x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x192x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_65(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_65(%input0: tensor<32x1x1x192xi8>) -> (%output0: tensor<1x1x192x32xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x1x1x192xi8>
%1 = tensor.empty() : tensor<1x1x192x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x1x1x192xi8>) outs(%1 : tensor<1x1x192x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x192x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x192x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_68(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_68(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_71(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_71(%input0: tensor<64x1x1x384xi8>) -> (%output0: tensor<1x1x384x64xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x384xi8>) outs(%1 : tensor<1x1x384x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_70(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_70(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_66(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_66(%input0: tensor<192x1x1x32xi8>) -> (%output0: tensor<1x1x32x192xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<192x1x1x32xi8>) outs(%1 : tensor<1x1x32x192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_78(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_78(%input0: tensor<576x1x1x96xi8>) -> (%output0: tensor<1x1x96x576xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<576x1x1x96xi8>) outs(%1 : tensor<1x1x96x576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_77(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_77(%input0: tensor<96x1x1x576xi8>) -> (%output0: tensor<1x1x576x96xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x576xi8>
%1 = tensor.empty() : tensor<1x1x576x96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x576xi8>) outs(%1 : tensor<1x1x576x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x576x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x576x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_80(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_80(%input0: tensor<576x1x1x96xi8>) -> (%output0: tensor<1x1x96x576xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<576x1x1x96xi8>) outs(%1 : tensor<1x1x96x576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_79(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_79(%input0: tensor<96x1x1x576xi8>) -> (%output0: tensor<1x1x576x96xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x576xi8>
%1 = tensor.empty() : tensor<1x1x576x96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x576xi8>) outs(%1 : tensor<1x1x576x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x576x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x576x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_69(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_69(%input0: tensor<64x1x1x384xi8>) -> (%output0: tensor<1x1x384x64xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x384xi8>) outs(%1 : tensor<1x1x384x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_81(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_81(%input0: tensor<160x1x1x576xi8>) -> (%output0: tensor<1x1x576x160xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160x1x1x576xi8>
%1 = tensor.empty() : tensor<1x1x576x160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<160x1x1x576xi8>) outs(%1 : tensor<1x1x576x160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x576x160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x576x160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_76(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_76(%input0: tensor<576x1x1x96xi8>) -> (%output0: tensor<1x1x96x576xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<576x1x1x96xi8>) outs(%1 : tensor<1x1x96x576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_72(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_72(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_75(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_75(%input0: tensor<96x1x1x384xi8>) -> (%output0: tensor<1x1x384x96xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x384xi8>) outs(%1 : tensor<1x1x384x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_74(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_74(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_83(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_83(%input0: tensor<160x1x1x960xi8>) -> (%output0: tensor<1x1x960x160xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160x1x1x960xi8>
%1 = tensor.empty() : tensor<1x1x960x160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<160x1x1x960xi8>) outs(%1 : tensor<1x1x960x160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x960x160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x960x160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_73(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_73(%input0: tensor<64x1x1x384xi8>) -> (%output0: tensor<1x1x384x64xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x384xi8>) outs(%1 : tensor<1x1x384x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_84(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_84(%input0: tensor<960x1x1x160xi8>) -> (%output0: tensor<1x1x160x960xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x1x1x160xi8>
%1 = tensor.empty() : tensor<1x1x160x960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<960x1x1x160xi8>) outs(%1 : tensor<1x1x160x960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x160x960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x160x960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_82(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_82(%input0: tensor<960x1x1x160xi8>) -> (%output0: tensor<1x1x160x960xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x1x1x160xi8>
%1 = tensor.empty() : tensor<1x1x160x960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<960x1x1x160xi8>) outs(%1 : tensor<1x1x160x960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x160x960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x160x960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_85(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_85(%input0: tensor<160x1x1x960xi8>) -> (%output0: tensor<1x1x960x160xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160x1x1x960xi8>
%1 = tensor.empty() : tensor<1x1x960x160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<160x1x1x960xi8>) outs(%1 : tensor<1x1x960x160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x960x160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x960x160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_86(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_86(%input0: tensor<960x1x1x160xi8>) -> (%output0: tensor<1x1x160x960xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x1x1x160xi8>
%1 = tensor.empty() : tensor<1x1x160x960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<960x1x1x160xi8>) outs(%1 : tensor<1x1x160x960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x160x960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x160x960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_87(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_87(%input0: tensor<320x1x1x960xi8>) -> (%output0: tensor<1x1x960x320xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<320x1x1x960xi8>
%1 = tensor.empty() : tensor<1x1x960x320xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<320x1x1x960xi8>) outs(%1 : tensor<1x1x960x320xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x960x320xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x960x320xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_89(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_89(%input0: tensor<256x1x1x320xi8>) -> (%output0: tensor<1x1x320x256xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256x1x1x320xi8>
%1 = tensor.empty() : tensor<1x1x320x256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<256x1x1x320xi8>) outs(%1 : tensor<1x1x320x256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x320x256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x320x256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_88(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_88(%input0: tensor<256x1x1x320xi8>) -> (%output0: tensor<1x1x320x256xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256x1x1x320xi8>
%1 = tensor.empty() : tensor<1x1x320x256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<256x1x1x320xi8>) outs(%1 : tensor<1x1x320x256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x320x256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x320x256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_91(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_91(%input0: tensor<21x1x1x256xi8>) -> (%output0: tensor<1x1x256x21xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<21x1x1x256xi8>
%1 = tensor.empty() : tensor<1x1x256x21xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<21x1x1x256xi8>) outs(%1 : tensor<1x1x256x21xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x256x21xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x256x21xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_90(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_90(%input0: tensor<256x1x1x512xi8>) -> (%output0: tensor<1x1x512x256xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256x1x1x512xi8>
%1 = tensor.empty() : tensor<1x1x512x256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<256x1x1x512xi8>) outs(%1 : tensor<1x1x512x256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x512x256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x512x256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @tf2onnx(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @tf2onnx$async(%arg0, %0, %fence) : (!hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @tf2onnx$async(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant dense_resource<__elided__> : tensor<1x21xf32>
%cst_0 = arith.constant dense_resource<__elided__> : tensor<1x256x1x1xf32>
%cst_1 = arith.constant 0.999259948 : f32
%cst_2 = arith.constant 6.000000e+00 : f32
%cst_3 = arith.constant 6.400000e+01 : f32
%cst_4 = arith.constant -9.53674316E-7 : f32
%cst_5 = arith.constant 7.89230776 : f32
%cst_6 = arith.constant 6.500000e+01 : f32
%cst_7 = arith.constant 7.812500e-03 : f32
%cst_8 = arith.constant 1.250000e-01 : f32
%cst_9 = arith.constant 2.500000e-01 : f32
%cst_10 = arith.constant 6.250000e-02 : f32
%cst_11 = arith.constant 3.125000e-02 : f32
%cst_12 = arith.constant 5.000000e-01 : f32
%cst_13 = arith.constant 1.000000e+00 : f32
%cst_14 = arith.constant 0.000000e+00 : f32
%cst_15 = arith.constant 1.270000e+02 : f32
%cst_16 = arith.constant -1.280000e+02 : f32
%cst_17 = arith.constant 4.225000e+03 : f32
%c0 = arith.constant 0 : index
%cst_18 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_19 = arith.constant dense_resource<__elided__> : tensor<576xf32>
%cst_20 = arith.constant dense_resource<__elided__> : tensor<960xf32>
%cst_21 = arith.constant dense_resource<__elided__> : tensor<144xf32>
%cst_22 = arith.constant dense_resource<__elided__> : tensor<192xf32>
%cst_23 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_24 = arith.constant dense_resource<__elided__> : tensor<256xf32>
%cst_25 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_26 = arith.constant dense_resource<__elided__> : tensor<3x3x3x32xf32>
%cst_27 = arith.constant dense_resource<__elided__> : tensor<32x3x3xf32>
%cst_28 = arith.constant dense_resource<__elided__> : tensor<16xf32>
%cst_29 = arith.constant dense_resource<__elided__> : tensor<96x3x3xf32>
%cst_30 = arith.constant dense_resource<__elided__> : tensor<24xf32>
%cst_31 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_32 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_33 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_34 = arith.constant dense_resource<__elided__> : tensor<576x3x3xf32>
%cst_35 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_36 = arith.constant dense_resource<__elided__> : tensor<576x3x3xf32>
%cst_37 = arith.constant dense_resource<__elided__> : tensor<576xf32>
%cst_38 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_39 = arith.constant dense_resource<__elided__> : tensor<576x3x3xf32>
%cst_40 = arith.constant dense_resource<__elided__> : tensor<576xf32>
%cst_41 = arith.constant dense_resource<__elided__> : tensor<160xf32>
%cst_42 = arith.constant dense_resource<__elided__> : tensor<960x3x3xf32>
%cst_43 = arith.constant dense_resource<__elided__> : tensor<160xf32>
%cst_44 = arith.constant dense_resource<__elided__> : tensor<960x3x3xf32>
%cst_45 = arith.constant dense_resource<__elided__> : tensor<960xf32>
%cst_46 = arith.constant dense_resource<__elided__> : tensor<160xf32>
%cst_47 = arith.constant dense_resource<__elided__> : tensor<960x3x3xf32>
%cst_48 = arith.constant dense_resource<__elided__> : tensor<960xf32>
%cst_49 = arith.constant dense_resource<__elided__> : tensor<320xf32>
%cst_50 = arith.constant dense_resource<__elided__> : tensor<144x3x3xf32>
%cst_51 = arith.constant dense_resource<__elided__> : tensor<24xf32>
%cst_52 = arith.constant dense_resource<__elided__> : tensor<144x3x3xf32>
%cst_53 = arith.constant dense_resource<__elided__> : tensor<144xf32>
%cst_54 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_55 = arith.constant dense_resource<__elided__> : tensor<192x3x3xf32>
%cst_56 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_57 = arith.constant dense_resource<__elided__> : tensor<192x3x3xf32>
%cst_58 = arith.constant dense_resource<__elided__> : tensor<192xf32>
%cst_59 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_60 = arith.constant dense_resource<__elided__> : tensor<192x3x3xf32>
%cst_61 = arith.constant dense_resource<__elided__> : tensor<192xf32>
%cst_62 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_63 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_64 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_65 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_66 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_67 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_68 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_69 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_70 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_71 = arith.constant dense_resource<__elided__> : tensor<256xf32>
%cst_72 = arith.constant dense_resource<__elided__> : tensor<1x1x32x16xf32>
%cst_73 = arith.constant dense_resource<__elided__> : tensor<1x1x16x96xf32>
%cst_74 = arith.constant dense_resource<__elided__> : tensor<1x1x96x24xf32>
%cst_75 = arith.constant dense_resource<__elided__> : tensor<1x1x24x144xf32>
%cst_76 = arith.constant dense_resource<__elided__> : tensor<1x1x144x24xf32>
%cst_77 = arith.constant dense_resource<__elided__> : tensor<1x1x24x144xf32>
%cst_78 = arith.constant dense_resource<__elided__> : tensor<1x1x144x32xf32>
%cst_79 = arith.constant dense_resource<__elided__> : tensor<1x1x32x192xf32>
%cst_80 = arith.constant dense_resource<__elided__> : tensor<1x1x192x32xf32>
%cst_81 = arith.constant dense_resource<__elided__> : tensor<1x1x32x192xf32>
%cst_82 = arith.constant dense_resource<__elided__> : tensor<1x1x192x32xf32>
%cst_83 = arith.constant dense_resource<__elided__> : tensor<1x1x32x192xf32>
%cst_84 = arith.constant dense_resource<__elided__> : tensor<1x1x192x64xf32>
%cst_85 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_86 = arith.constant dense_resource<__elided__> : tensor<1x1x384x64xf32>
%cst_87 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_88 = arith.constant dense_resource<__elided__> : tensor<1x1x384x64xf32>
%cst_89 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_90 = arith.constant dense_resource<__elided__> : tensor<1x1x384x64xf32>
%cst_91 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_92 = arith.constant dense_resource<__elided__> : tensor<1x1x384x96xf32>
%cst_93 = arith.constant dense_resource<__elided__> : tensor<1x1x96x576xf32>
%cst_94 = arith.constant dense_resource<__elided__> : tensor<1x1x576x96xf32>
%cst_95 = arith.constant dense_resource<__elided__> : tensor<1x1x96x576xf32>
%cst_96 = arith.constant dense_resource<__elided__> : tensor<1x1x576x96xf32>
%cst_97 = arith.constant dense_resource<__elided__> : tensor<1x1x96x576xf32>
%cst_98 = arith.constant dense_resource<__elided__> : tensor<1x1x576x160xf32>
%cst_99 = arith.constant dense_resource<__elided__> : tensor<1x1x160x960xf32>
%cst_100 = arith.constant dense_resource<__elided__> : tensor<1x1x960x160xf32>
%cst_101 = arith.constant dense_resource<__elided__> : tensor<1x1x160x960xf32>
%cst_102 = arith.constant dense_resource<__elided__> : tensor<1x1x960x160xf32>
%cst_103 = arith.constant dense_resource<__elided__> : tensor<1x1x160x960xf32>
%cst_104 = arith.constant dense_resource<__elided__> : tensor<1x1x960x320xf32>
%cst_105 = arith.constant dense_resource<__elided__> : tensor<1x1x320x256xf32>
%cst_106 = arith.constant dense_resource<__elided__> : tensor<1x1x320x256xf32>
%cst_107 = arith.constant dense_resource<__elided__> : tensor<1x1x512x256xf32>
%cst_108 = arith.constant dense_resource<__elided__> : tensor<1x1x256x21xf32>
%0 = hal.tensor.import wait(%arg1) => %arg0 : !hal.buffer_view -> tensor<1x513x513x3xf32>
%expanded = tensor.expand_shape %0 [[0], [1], [2], [3, 4]] output_shape [1, 513, 513, 1, 3] : tensor<1x513x513x3xf32> into tensor<1x513x513x1x3xf32>
%1 = tensor.empty() : tensor<1x513x513x1x3xf32>
%2 = flow.dispatch.region -> (tensor<1x513x513x1x3xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<1x513x513x1x3xf32>) outs(%1 : tensor<1x513x513x1x3xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_7 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_7 : f32
linalg.yield %248 : f32
} -> tensor<1x513x513x1x3xf32>
flow.return %237 : tensor<1x513x513x1x3xf32>
}
%collapsed = tensor.collapse_shape %2 [[0, 1], [2, 3], [4]] : tensor<1x513x513x1x3xf32> into tensor<513x513x3xf32>
%3 = tensor.empty() : tensor<515x515x3xf32>
%4 = linalg.fill ins(%cst_14 : f32) outs(%3 : tensor<515x515x3xf32>) -> tensor<515x515x3xf32>
%inserted_slice = tensor.insert_slice %collapsed into %4[1, 1, 0] [513, 513, 3] [1, 1, 1] : tensor<513x513x3xf32> into tensor<515x515x3xf32>
%expanded_109 = tensor.expand_shape %inserted_slice [[0, 1], [2], [3]] output_shape [1, 515, 515, 3] : tensor<515x515x3xf32> into tensor<1x515x515x3xf32>
%5 = tensor.empty() : tensor<1x257x257x32xf32>
%6 = linalg.fill ins(%cst_14 : f32) outs(%5 : tensor<1x257x257x32xf32>) -> tensor<1x257x257x32xf32>
%7 = flow.dispatch.region -> (tensor<1x257x257x32xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%expanded_109, %cst_26 : tensor<1x515x515x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x257x257x32xf32>) -> tensor<1x257x257x32xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_25 : tensor<1x257x257x32xf32>, tensor<32xf32>) outs(%5 : tensor<1x257x257x32xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x257x257x32xf32>
flow.return %238 : tensor<1x257x257x32xf32>
}
%collapsed_110 = tensor.collapse_shape %7 [[0, 1], [2], [3]] : tensor<1x257x257x32xf32> into tensor<257x257x32xf32>
%8 = tensor.empty() : tensor<259x259x32xf32>
%9 = linalg.fill ins(%cst_14 : f32) outs(%8 : tensor<259x259x32xf32>) -> tensor<259x259x32xf32>
%inserted_slice_111 = tensor.insert_slice %collapsed_110 into %9[1, 1, 0] [257, 257, 32] [1, 1, 1] : tensor<257x257x32xf32> into tensor<259x259x32xf32>
%expanded_112 = tensor.expand_shape %inserted_slice_111 [[0], [1], [2, 3]] output_shape [259, 259, 1, 32] : tensor<259x259x32xf32> into tensor<259x259x1x32xf32>
%10 = tensor.empty() : tensor<1x32x259x259xf32>
%11 = flow.dispatch.region -> (tensor<1x32x259x259xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_112 : tensor<259x259x1x32xf32>) outs(%10 : tensor<1x32x259x259xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x32x259x259xf32>
flow.return %237 : tensor<1x32x259x259xf32>
}
%12 = tensor.empty() : tensor<1x32x257x257xf32>
%13 = linalg.fill ins(%cst_14 : f32) outs(%12 : tensor<1x32x257x257xf32>) -> tensor<1x32x257x257xf32>
%14 = flow.dispatch.region -> (tensor<1x32x257x257xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%11, %cst_27 : tensor<1x32x259x259xf32>, tensor<32x3x3xf32>) outs(%13 : tensor<1x32x257x257xf32>) -> tensor<1x32x257x257xf32>
flow.return %237 : tensor<1x32x257x257xf32>
}
%15 = tensor.empty() : tensor<257x257x1x32xf32>
%16 = flow.dispatch.region -> (tensor<257x257x1x32xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14 : tensor<1x32x257x257xf32>) outs(%15 : tensor<257x257x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<257x257x1x32xf32>
flow.return %237 : tensor<257x257x1x32xf32>
}
%collapsed_113 = tensor.collapse_shape %16 [[0], [1], [2, 3]] : tensor<257x257x1x32xf32> into tensor<257x257x32xf32>
%expanded_114 = tensor.expand_shape %collapsed_113 [[0, 1], [2], [3]] output_shape [1, 257, 257, 32] : tensor<257x257x32xf32> into tensor<1x257x257x32xf32>
%17 = flow.dispatch.region -> (tensor<1x257x257x32xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_114 : tensor<1x257x257x32xf32>) outs(%5 : tensor<1x257x257x32xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x257x257x32xf32>
flow.return %237 : tensor<1x257x257x32xf32>
}
%18 = tensor.empty() : tensor<1x257x257x16xf32>
%19 = linalg.fill ins(%cst_14 : f32) outs(%18 : tensor<1x257x257x16xf32>) -> tensor<1x257x257x16xf32>
%20 = flow.dispatch.region -> (tensor<1x257x257x16xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%17, %cst_72 : tensor<1x257x257x32xf32>, tensor<1x1x32x16xf32>) outs(%19 : tensor<1x257x257x16xf32>) -> tensor<1x257x257x16xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_28 : tensor<1x257x257x16xf32>, tensor<16xf32>) outs(%18 : tensor<1x257x257x16xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.divf %239, %cst_9 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_9 : f32
linalg.yield %250 : f32
} -> tensor<1x257x257x16xf32>
flow.return %238 : tensor<1x257x257x16xf32>
}
%21 = tensor.empty() : tensor<1x257x257x96xf32>
%22 = linalg.fill ins(%cst_14 : f32) outs(%21 : tensor<1x257x257x96xf32>) -> tensor<1x257x257x96xf32>
%23 = flow.dispatch.region -> (tensor<1x257x257x96xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%20, %cst_73 : tensor<1x257x257x16xf32>, tensor<1x1x16x96xf32>) outs(%22 : tensor<1x257x257x96xf32>) -> tensor<1x257x257x96xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_18 : tensor<1x257x257x96xf32>, tensor<96xf32>) outs(%21 : tensor<1x257x257x96xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x257x257x96xf32>
flow.return %238 : tensor<1x257x257x96xf32>
}
%collapsed_115 = tensor.collapse_shape %23 [[0, 1], [2], [3]] : tensor<1x257x257x96xf32> into tensor<257x257x96xf32>
%24 = tensor.empty() : tensor<259x259x96xf32>
%25 = linalg.fill ins(%cst_14 : f32) outs(%24 : tensor<259x259x96xf32>) -> tensor<259x259x96xf32>
%inserted_slice_116 = tensor.insert_slice %collapsed_115 into %25[1, 1, 0] [257, 257, 96] [1, 1, 1] : tensor<257x257x96xf32> into tensor<259x259x96xf32>
%expanded_117 = tensor.expand_shape %inserted_slice_116 [[0], [1], [2, 3]] output_shape [259, 259, 1, 96] : tensor<259x259x96xf32> into tensor<259x259x1x96xf32>
%26 = tensor.empty() : tensor<1x96x259x259xf32>
%27 = flow.dispatch.region -> (tensor<1x96x259x259xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_117 : tensor<259x259x1x96xf32>) outs(%26 : tensor<1x96x259x259xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x96x259x259xf32>
flow.return %237 : tensor<1x96x259x259xf32>
}
%28 = tensor.empty() : tensor<1x96x129x129xf32>
%29 = linalg.fill ins(%cst_14 : f32) outs(%28 : tensor<1x96x129x129xf32>) -> tensor<1x96x129x129xf32>
%30 = flow.dispatch.region -> (tensor<1x96x129x129xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%27, %cst_29 : tensor<1x96x259x259xf32>, tensor<96x3x3xf32>) outs(%29 : tensor<1x96x129x129xf32>) -> tensor<1x96x129x129xf32>
flow.return %237 : tensor<1x96x129x129xf32>
}
%31 = tensor.empty() : tensor<129x129x1x96xf32>
%32 = flow.dispatch.region -> (tensor<129x129x1x96xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<1x96x129x129xf32>) outs(%31 : tensor<129x129x1x96xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<129x129x1x96xf32>
flow.return %237 : tensor<129x129x1x96xf32>
}
%collapsed_118 = tensor.collapse_shape %32 [[0], [1], [2, 3]] : tensor<129x129x1x96xf32> into tensor<129x129x96xf32>
%expanded_119 = tensor.expand_shape %collapsed_118 [[0, 1], [2], [3]] output_shape [1, 129, 129, 96] : tensor<129x129x96xf32> into tensor<1x129x129x96xf32>
%33 = tensor.empty() : tensor<1x129x129x96xf32>
%34 = flow.dispatch.region -> (tensor<1x129x129x96xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_119 : tensor<1x129x129x96xf32>) outs(%33 : tensor<1x129x129x96xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x129x129x96xf32>
flow.return %237 : tensor<1x129x129x96xf32>
}
%35 = tensor.empty() : tensor<129x129x24xf32>
%36 = tensor.empty() : tensor<1x129x129x24xf32>
%37 = linalg.fill ins(%cst_14 : f32) outs(%36 : tensor<1x129x129x24xf32>) -> tensor<1x129x129x24xf32>
%38 = tensor.empty() : tensor<1x129x129x24xi8>
%39 = flow.dispatch.region -> (tensor<1x129x129x24xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%34, %cst_74 : tensor<1x129x129x96xf32>, tensor<1x1x96x24xf32>) outs(%37 : tensor<1x129x129x24xf32>) -> tensor<1x129x129x24xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_30 : tensor<1x129x129x24xf32>, tensor<24xf32>) outs(%38 : tensor<1x129x129x24xi8>) {
^bb0(%in: f32, %in_229: f32, %out: i8):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.divf %239, %cst_9 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
linalg.yield %247 : i8
} -> tensor<1x129x129x24xi8>
flow.return %238 : tensor<1x129x129x24xi8>
}
%collapsed_120 = tensor.collapse_shape %39 [[0, 1], [2], [3]] : tensor<1x129x129x24xi8> into tensor<129x129x24xi8>
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_120 : tensor<129x129x24xi8>) outs(%35 : tensor<129x129x24xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_9 : f32
linalg.yield %239 : f32
} -> tensor<129x129x24xf32>
%expanded_121 = tensor.expand_shape %40 [[0, 1], [2], [3]] output_shape [1, 129, 129, 24] : tensor<129x129x24xf32> into tensor<1x129x129x24xf32>
%41 = tensor.empty() : tensor<1x129x129x144xf32>
%42 = linalg.fill ins(%cst_14 : f32) outs(%41 : tensor<1x129x129x144xf32>) -> tensor<1x129x129x144xf32>
%43 = flow.dispatch.region -> (tensor<1x129x129x144xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_121, %cst_75 : tensor<1x129x129x24xf32>, tensor<1x1x24x144xf32>) outs(%42 : tensor<1x129x129x144xf32>) -> tensor<1x129x129x144xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_21 : tensor<1x129x129x144xf32>, tensor<144xf32>) outs(%41 : tensor<1x129x129x144xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x129x129x144xf32>
flow.return %238 : tensor<1x129x129x144xf32>
}
%collapsed_122 = tensor.collapse_shape %43 [[0, 1], [2], [3]] : tensor<1x129x129x144xf32> into tensor<129x129x144xf32>
%44 = tensor.empty() : tensor<131x131x144xf32>
%45 = linalg.fill ins(%cst_14 : f32) outs(%44 : tensor<131x131x144xf32>) -> tensor<131x131x144xf32>
%inserted_slice_123 = tensor.insert_slice %collapsed_122 into %45[1, 1, 0] [129, 129, 144] [1, 1, 1] : tensor<129x129x144xf32> into tensor<131x131x144xf32>
%expanded_124 = tensor.expand_shape %inserted_slice_123 [[0], [1], [2, 3]] output_shape [131, 131, 1, 144] : tensor<131x131x144xf32> into tensor<131x131x1x144xf32>
%46 = tensor.empty() : tensor<1x144x131x131xf32>
%47 = flow.dispatch.region -> (tensor<1x144x131x131xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_124 : tensor<131x131x1x144xf32>) outs(%46 : tensor<1x144x131x131xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x144x131x131xf32>
flow.return %237 : tensor<1x144x131x131xf32>
}
%48 = tensor.empty() : tensor<1x144x129x129xf32>
%49 = linalg.fill ins(%cst_14 : f32) outs(%48 : tensor<1x144x129x129xf32>) -> tensor<1x144x129x129xf32>
%50 = flow.dispatch.region -> (tensor<1x144x129x129xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%47, %cst_50 : tensor<1x144x131x131xf32>, tensor<144x3x3xf32>) outs(%49 : tensor<1x144x129x129xf32>) -> tensor<1x144x129x129xf32>
flow.return %237 : tensor<1x144x129x129xf32>
}
%51 = tensor.empty() : tensor<129x129x1x144xf32>
%52 = flow.dispatch.region -> (tensor<129x129x1x144xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50 : tensor<1x144x129x129xf32>) outs(%51 : tensor<129x129x1x144xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<129x129x1x144xf32>
flow.return %237 : tensor<129x129x1x144xf32>
}
%collapsed_125 = tensor.collapse_shape %52 [[0], [1], [2, 3]] : tensor<129x129x1x144xf32> into tensor<129x129x144xf32>
%expanded_126 = tensor.expand_shape %collapsed_125 [[0, 1], [2], [3]] output_shape [1, 129, 129, 144] : tensor<129x129x144xf32> into tensor<1x129x129x144xf32>
%53 = flow.dispatch.region -> (tensor<1x129x129x144xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_126 : tensor<1x129x129x144xf32>) outs(%41 : tensor<1x129x129x144xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x129x129x144xf32>
flow.return %237 : tensor<1x129x129x144xf32>
}
%54 = flow.dispatch.region -> (tensor<1x129x129x24xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%53, %cst_76 : tensor<1x129x129x144xf32>, tensor<1x1x144x24xf32>) outs(%37 : tensor<1x129x129x24xf32>) -> tensor<1x129x129x24xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_121, %237, %cst_51 : tensor<1x129x129x24xf32>, tensor<1x129x129x24xf32>, tensor<24xf32>) outs(%36 : tensor<1x129x129x24xf32>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: f32):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_9 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_9 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_12 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
%260 = arith.extsi %259 : i8 to i32
%261 = arith.sitofp %260 : i32 to f32
%262 = arith.mulf %261, %cst_12 : f32
linalg.yield %262 : f32
} -> tensor<1x129x129x24xf32>
flow.return %238 : tensor<1x129x129x24xf32>
}
%55 = flow.dispatch.region -> (tensor<1x129x129x144xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%54, %cst_77 : tensor<1x129x129x24xf32>, tensor<1x1x24x144xf32>) outs(%42 : tensor<1x129x129x144xf32>) -> tensor<1x129x129x144xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_53 : tensor<1x129x129x144xf32>, tensor<144xf32>) outs(%41 : tensor<1x129x129x144xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x129x129x144xf32>
flow.return %238 : tensor<1x129x129x144xf32>
}
%collapsed_127 = tensor.collapse_shape %55 [[0, 1], [2], [3]] : tensor<1x129x129x144xf32> into tensor<129x129x144xf32>
%inserted_slice_128 = tensor.insert_slice %collapsed_127 into %45[1, 1, 0] [129, 129, 144] [1, 1, 1] : tensor<129x129x144xf32> into tensor<131x131x144xf32>
%expanded_129 = tensor.expand_shape %inserted_slice_128 [[0], [1], [2, 3]] output_shape [131, 131, 1, 144] : tensor<131x131x144xf32> into tensor<131x131x1x144xf32>
%56 = flow.dispatch.region -> (tensor<1x144x131x131xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_129 : tensor<131x131x1x144xf32>) outs(%46 : tensor<1x144x131x131xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x144x131x131xf32>
flow.return %237 : tensor<1x144x131x131xf32>
}
%57 = tensor.empty() : tensor<1x144x65x65xf32>
%58 = linalg.fill ins(%cst_14 : f32) outs(%57 : tensor<1x144x65x65xf32>) -> tensor<1x144x65x65xf32>
%59 = flow.dispatch.region -> (tensor<1x144x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%56, %cst_52 : tensor<1x144x131x131xf32>, tensor<144x3x3xf32>) outs(%58 : tensor<1x144x65x65xf32>) -> tensor<1x144x65x65xf32>
flow.return %237 : tensor<1x144x65x65xf32>
}
%60 = tensor.empty() : tensor<65x65x1x144xf32>
%61 = flow.dispatch.region -> (tensor<65x65x1x144xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%59 : tensor<1x144x65x65xf32>) outs(%60 : tensor<65x65x1x144xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x144xf32>
flow.return %237 : tensor<65x65x1x144xf32>
}
%collapsed_130 = tensor.collapse_shape %61 [[0], [1], [2, 3]] : tensor<65x65x1x144xf32> into tensor<65x65x144xf32>
%expanded_131 = tensor.expand_shape %collapsed_130 [[0, 1], [2], [3]] output_shape [1, 65, 65, 144] : tensor<65x65x144xf32> into tensor<1x65x65x144xf32>
%62 = tensor.empty() : tensor<1x65x65x144xf32>
%63 = flow.dispatch.region -> (tensor<1x65x65x144xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_131 : tensor<1x65x65x144xf32>) outs(%62 : tensor<1x65x65x144xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x144xf32>
flow.return %237 : tensor<1x65x65x144xf32>
}
%64 = tensor.empty() : tensor<65x65x32xf32>
%65 = tensor.empty() : tensor<1x65x65x32xf32>
%66 = linalg.fill ins(%cst_14 : f32) outs(%65 : tensor<1x65x65x32xf32>) -> tensor<1x65x65x32xf32>
%67 = tensor.empty() : tensor<1x65x65x32xi8>
%68 = flow.dispatch.region -> (tensor<1x65x65x32xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%63, %cst_78 : tensor<1x65x65x144xf32>, tensor<1x1x144x32xf32>) outs(%66 : tensor<1x65x65x32xf32>) -> tensor<1x65x65x32xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_54 : tensor<1x65x65x32xf32>, tensor<32xf32>) outs(%67 : tensor<1x65x65x32xi8>) {
^bb0(%in: f32, %in_229: f32, %out: i8):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.divf %239, %cst_9 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
linalg.yield %247 : i8
} -> tensor<1x65x65x32xi8>
flow.return %238 : tensor<1x65x65x32xi8>
}
%collapsed_132 = tensor.collapse_shape %68 [[0, 1], [2], [3]] : tensor<1x65x65x32xi8> into tensor<65x65x32xi8>
%69 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_132 : tensor<65x65x32xi8>) outs(%64 : tensor<65x65x32xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_9 : f32
linalg.yield %239 : f32
} -> tensor<65x65x32xf32>
%expanded_133 = tensor.expand_shape %69 [[0, 1], [2], [3]] output_shape [1, 65, 65, 32] : tensor<65x65x32xf32> into tensor<1x65x65x32xf32>
%70 = tensor.empty() : tensor<1x65x65x192xf32>
%71 = linalg.fill ins(%cst_14 : f32) outs(%70 : tensor<1x65x65x192xf32>) -> tensor<1x65x65x192xf32>
%72 = flow.dispatch.region -> (tensor<1x65x65x192xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_133, %cst_79 : tensor<1x65x65x32xf32>, tensor<1x1x32x192xf32>) outs(%71 : tensor<1x65x65x192xf32>) -> tensor<1x65x65x192xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_22 : tensor<1x65x65x192xf32>, tensor<192xf32>) outs(%70 : tensor<1x65x65x192xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x192xf32>
flow.return %238 : tensor<1x65x65x192xf32>
}
%collapsed_134 = tensor.collapse_shape %72 [[0, 1], [2], [3]] : tensor<1x65x65x192xf32> into tensor<65x65x192xf32>
%73 = tensor.empty() : tensor<67x67x192xf32>
%74 = linalg.fill ins(%cst_14 : f32) outs(%73 : tensor<67x67x192xf32>) -> tensor<67x67x192xf32>
%inserted_slice_135 = tensor.insert_slice %collapsed_134 into %74[1, 1, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<67x67x192xf32>
%expanded_136 = tensor.expand_shape %inserted_slice_135 [[0], [1], [2, 3]] output_shape [67, 67, 1, 192] : tensor<67x67x192xf32> into tensor<67x67x1x192xf32>
%75 = tensor.empty() : tensor<1x192x67x67xf32>
%76 = flow.dispatch.region -> (tensor<1x192x67x67xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_136 : tensor<67x67x1x192xf32>) outs(%75 : tensor<1x192x67x67xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x192x67x67xf32>
flow.return %237 : tensor<1x192x67x67xf32>
}
%77 = tensor.empty() : tensor<1x192x65x65xf32>
%78 = linalg.fill ins(%cst_14 : f32) outs(%77 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
%79 = flow.dispatch.region -> (tensor<1x192x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%76, %cst_55 : tensor<1x192x67x67xf32>, tensor<192x3x3xf32>) outs(%78 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
flow.return %237 : tensor<1x192x65x65xf32>
}
%80 = tensor.empty() : tensor<65x65x1x192xf32>
%81 = flow.dispatch.region -> (tensor<65x65x1x192xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%79 : tensor<1x192x65x65xf32>) outs(%80 : tensor<65x65x1x192xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x192xf32>
flow.return %237 : tensor<65x65x1x192xf32>
}
%collapsed_137 = tensor.collapse_shape %81 [[0], [1], [2, 3]] : tensor<65x65x1x192xf32> into tensor<65x65x192xf32>
%expanded_138 = tensor.expand_shape %collapsed_137 [[0, 1], [2], [3]] output_shape [1, 65, 65, 192] : tensor<65x65x192xf32> into tensor<1x65x65x192xf32>
%82 = flow.dispatch.region -> (tensor<1x65x65x192xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_138 : tensor<1x65x65x192xf32>) outs(%70 : tensor<1x65x65x192xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x192xf32>
flow.return %237 : tensor<1x65x65x192xf32>
}
%83 = flow.dispatch.region -> (tensor<1x65x65x32xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%82, %cst_80 : tensor<1x65x65x192xf32>, tensor<1x1x192x32xf32>) outs(%66 : tensor<1x65x65x32xf32>) -> tensor<1x65x65x32xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_133, %237, %cst_56 : tensor<1x65x65x32xf32>, tensor<1x65x65x32xf32>, tensor<32xf32>) outs(%67 : tensor<1x65x65x32xi8>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: i8):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_9 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_9 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_9 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
linalg.yield %259 : i8
} -> tensor<1x65x65x32xi8>
flow.return %238 : tensor<1x65x65x32xi8>
}
%collapsed_139 = tensor.collapse_shape %83 [[0, 1], [2], [3]] : tensor<1x65x65x32xi8> into tensor<65x65x32xi8>
%84 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_139 : tensor<65x65x32xi8>) outs(%64 : tensor<65x65x32xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_9 : f32
linalg.yield %239 : f32
} -> tensor<65x65x32xf32>
%expanded_140 = tensor.expand_shape %84 [[0, 1], [2], [3]] output_shape [1, 65, 65, 32] : tensor<65x65x32xf32> into tensor<1x65x65x32xf32>
%85 = flow.dispatch.region -> (tensor<1x65x65x192xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_140, %cst_81 : tensor<1x65x65x32xf32>, tensor<1x1x32x192xf32>) outs(%71 : tensor<1x65x65x192xf32>) -> tensor<1x65x65x192xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_58 : tensor<1x65x65x192xf32>, tensor<192xf32>) outs(%70 : tensor<1x65x65x192xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x192xf32>
flow.return %238 : tensor<1x65x65x192xf32>
}
%collapsed_141 = tensor.collapse_shape %85 [[0, 1], [2], [3]] : tensor<1x65x65x192xf32> into tensor<65x65x192xf32>
%inserted_slice_142 = tensor.insert_slice %collapsed_141 into %74[1, 1, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<67x67x192xf32>
%expanded_143 = tensor.expand_shape %inserted_slice_142 [[0], [1], [2, 3]] output_shape [67, 67, 1, 192] : tensor<67x67x192xf32> into tensor<67x67x1x192xf32>
%86 = flow.dispatch.region -> (tensor<1x192x67x67xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_143 : tensor<67x67x1x192xf32>) outs(%75 : tensor<1x192x67x67xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x192x67x67xf32>
flow.return %237 : tensor<1x192x67x67xf32>
}
%87 = flow.dispatch.region -> (tensor<1x192x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%86, %cst_57 : tensor<1x192x67x67xf32>, tensor<192x3x3xf32>) outs(%78 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
flow.return %237 : tensor<1x192x65x65xf32>
}
%88 = flow.dispatch.region -> (tensor<65x65x1x192xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%87 : tensor<1x192x65x65xf32>) outs(%80 : tensor<65x65x1x192xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x192xf32>
flow.return %237 : tensor<65x65x1x192xf32>
}
%collapsed_144 = tensor.collapse_shape %88 [[0], [1], [2, 3]] : tensor<65x65x1x192xf32> into tensor<65x65x192xf32>
%expanded_145 = tensor.expand_shape %collapsed_144 [[0, 1], [2], [3]] output_shape [1, 65, 65, 192] : tensor<65x65x192xf32> into tensor<1x65x65x192xf32>
%89 = flow.dispatch.region -> (tensor<1x65x65x192xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_145 : tensor<1x65x65x192xf32>) outs(%70 : tensor<1x65x65x192xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x192xf32>
flow.return %237 : tensor<1x65x65x192xf32>
}
%90 = flow.dispatch.region -> (tensor<1x65x65x32xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%89, %cst_82 : tensor<1x65x65x192xf32>, tensor<1x1x192x32xf32>) outs(%66 : tensor<1x65x65x32xf32>) -> tensor<1x65x65x32xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_140, %237, %cst_59 : tensor<1x65x65x32xf32>, tensor<1x65x65x32xf32>, tensor<32xf32>) outs(%65 : tensor<1x65x65x32xf32>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: f32):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_9 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_9 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_9 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
%260 = arith.extsi %259 : i8 to i32
%261 = arith.sitofp %260 : i32 to f32
%262 = arith.mulf %261, %cst_9 : f32
linalg.yield %262 : f32
} -> tensor<1x65x65x32xf32>
flow.return %238 : tensor<1x65x65x32xf32>
}
%91 = flow.dispatch.region -> (tensor<1x65x65x192xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%90, %cst_83 : tensor<1x65x65x32xf32>, tensor<1x1x32x192xf32>) outs(%71 : tensor<1x65x65x192xf32>) -> tensor<1x65x65x192xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_61 : tensor<1x65x65x192xf32>, tensor<192xf32>) outs(%70 : tensor<1x65x65x192xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x192xf32>
flow.return %238 : tensor<1x65x65x192xf32>
}
%collapsed_146 = tensor.collapse_shape %91 [[0, 1], [2], [3]] : tensor<1x65x65x192xf32> into tensor<65x65x192xf32>
%inserted_slice_147 = tensor.insert_slice %collapsed_146 into %74[1, 1, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<67x67x192xf32>
%expanded_148 = tensor.expand_shape %inserted_slice_147 [[0], [1], [2, 3]] output_shape [67, 67, 1, 192] : tensor<67x67x192xf32> into tensor<67x67x1x192xf32>
%92 = flow.dispatch.region -> (tensor<1x192x67x67xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_148 : tensor<67x67x1x192xf32>) outs(%75 : tensor<1x192x67x67xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x192x67x67xf32>
flow.return %237 : tensor<1x192x67x67xf32>
}
%93 = flow.dispatch.region -> (tensor<1x192x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%92, %cst_60 : tensor<1x192x67x67xf32>, tensor<192x3x3xf32>) outs(%78 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
flow.return %237 : tensor<1x192x65x65xf32>
}
%94 = flow.dispatch.region -> (tensor<65x65x1x192xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%93 : tensor<1x192x65x65xf32>) outs(%80 : tensor<65x65x1x192xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x192xf32>
flow.return %237 : tensor<65x65x1x192xf32>
}
%collapsed_149 = tensor.collapse_shape %94 [[0], [1], [2, 3]] : tensor<65x65x1x192xf32> into tensor<65x65x192xf32>
%expanded_150 = tensor.expand_shape %collapsed_149 [[0, 1], [2], [3]] output_shape [1, 65, 65, 192] : tensor<65x65x192xf32> into tensor<1x65x65x192xf32>
%95 = flow.dispatch.region -> (tensor<1x65x65x192xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_150 : tensor<1x65x65x192xf32>) outs(%70 : tensor<1x65x65x192xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x192xf32>
flow.return %237 : tensor<1x65x65x192xf32>
}
%96 = tensor.empty() : tensor<65x65x64xf32>
%97 = tensor.empty() : tensor<1x65x65x64xf32>
%98 = linalg.fill ins(%cst_14 : f32) outs(%97 : tensor<1x65x65x64xf32>) -> tensor<1x65x65x64xf32>
%99 = tensor.empty() : tensor<1x65x65x64xi8>
%100 = flow.dispatch.region -> (tensor<1x65x65x64xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%95, %cst_84 : tensor<1x65x65x192xf32>, tensor<1x1x192x64xf32>) outs(%98 : tensor<1x65x65x64xf32>) -> tensor<1x65x65x64xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_62 : tensor<1x65x65x64xf32>, tensor<64xf32>) outs(%99 : tensor<1x65x65x64xi8>) {
^bb0(%in: f32, %in_229: f32, %out: i8):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
linalg.yield %247 : i8
} -> tensor<1x65x65x64xi8>
flow.return %238 : tensor<1x65x65x64xi8>
}
%collapsed_151 = tensor.collapse_shape %100 [[0, 1], [2], [3]] : tensor<1x65x65x64xi8> into tensor<65x65x64xi8>
%101 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_151 : tensor<65x65x64xi8>) outs(%96 : tensor<65x65x64xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_8 : f32
linalg.yield %239 : f32
} -> tensor<65x65x64xf32>
%expanded_152 = tensor.expand_shape %101 [[0, 1], [2], [3]] output_shape [1, 65, 65, 64] : tensor<65x65x64xf32> into tensor<1x65x65x64xf32>
%102 = tensor.empty() : tensor<1x65x65x384xf32>
%103 = linalg.fill ins(%cst_14 : f32) outs(%102 : tensor<1x65x65x384xf32>) -> tensor<1x65x65x384xf32>
%104 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_152, %cst_85 : tensor<1x65x65x64xf32>, tensor<1x1x64x384xf32>) outs(%103 : tensor<1x65x65x384xf32>) -> tensor<1x65x65x384xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_23 : tensor<1x65x65x384xf32>, tensor<384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x384xf32>
flow.return %238 : tensor<1x65x65x384xf32>
}
%collapsed_153 = tensor.collapse_shape %104 [[0, 1], [2], [3]] : tensor<1x65x65x384xf32> into tensor<65x65x384xf32>
%105 = tensor.empty() : tensor<69x69x384xf32>
%106 = linalg.fill ins(%cst_14 : f32) outs(%105 : tensor<69x69x384xf32>) -> tensor<69x69x384xf32>
%inserted_slice_154 = tensor.insert_slice %collapsed_153 into %106[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
%expanded_155 = tensor.expand_shape %inserted_slice_154 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%107 = tensor.empty() : tensor<1x384x69x69xf32>
%108 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_155 : tensor<69x69x1x384xf32>) outs(%107 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %237 : tensor<1x384x69x69xf32>
}
%109 = tensor.empty() : tensor<1x384x65x65xf32>
%110 = linalg.fill ins(%cst_14 : f32) outs(%109 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
%111 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%108, %cst_63 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%110 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %237 : tensor<1x384x65x65xf32>
}
%112 = tensor.empty() : tensor<65x65x1x384xf32>
%113 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%111 : tensor<1x384x65x65xf32>) outs(%112 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x384xf32>
flow.return %237 : tensor<65x65x1x384xf32>
}
%collapsed_156 = tensor.collapse_shape %113 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%expanded_157 = tensor.expand_shape %collapsed_156 [[0, 1], [2], [3]] output_shape [1, 65, 65, 384] : tensor<65x65x384xf32> into tensor<1x65x65x384xf32>
%114 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_157 : tensor<1x65x65x384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x384xf32>
flow.return %237 : tensor<1x65x65x384xf32>
}
%115 = flow.dispatch.region -> (tensor<1x65x65x64xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%114, %cst_86 : tensor<1x65x65x384xf32>, tensor<1x1x384x64xf32>) outs(%98 : tensor<1x65x65x64xf32>) -> tensor<1x65x65x64xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_152, %237, %cst_64 : tensor<1x65x65x64xf32>, tensor<1x65x65x64xf32>, tensor<64xf32>) outs(%99 : tensor<1x65x65x64xi8>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: i8):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_8 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_9 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
linalg.yield %259 : i8
} -> tensor<1x65x65x64xi8>
flow.return %238 : tensor<1x65x65x64xi8>
}
%collapsed_158 = tensor.collapse_shape %115 [[0, 1], [2], [3]] : tensor<1x65x65x64xi8> into tensor<65x65x64xi8>
%116 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_158 : tensor<65x65x64xi8>) outs(%96 : tensor<65x65x64xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_9 : f32
linalg.yield %239 : f32
} -> tensor<65x65x64xf32>
%expanded_159 = tensor.expand_shape %116 [[0, 1], [2], [3]] output_shape [1, 65, 65, 64] : tensor<65x65x64xf32> into tensor<1x65x65x64xf32>
%117 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_159, %cst_87 : tensor<1x65x65x64xf32>, tensor<1x1x64x384xf32>) outs(%103 : tensor<1x65x65x384xf32>) -> tensor<1x65x65x384xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_66 : tensor<1x65x65x384xf32>, tensor<384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x384xf32>
flow.return %238 : tensor<1x65x65x384xf32>
}
%collapsed_160 = tensor.collapse_shape %117 [[0, 1], [2], [3]] : tensor<1x65x65x384xf32> into tensor<65x65x384xf32>
%inserted_slice_161 = tensor.insert_slice %collapsed_160 into %106[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
%expanded_162 = tensor.expand_shape %inserted_slice_161 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%118 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_162 : tensor<69x69x1x384xf32>) outs(%107 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %237 : tensor<1x384x69x69xf32>
}
%119 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%118, %cst_65 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%110 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %237 : tensor<1x384x65x65xf32>
}
%120 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%119 : tensor<1x384x65x65xf32>) outs(%112 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x384xf32>
flow.return %237 : tensor<65x65x1x384xf32>
}
%collapsed_163 = tensor.collapse_shape %120 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%expanded_164 = tensor.expand_shape %collapsed_163 [[0, 1], [2], [3]] output_shape [1, 65, 65, 384] : tensor<65x65x384xf32> into tensor<1x65x65x384xf32>
%121 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_164 : tensor<1x65x65x384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x384xf32>
flow.return %237 : tensor<1x65x65x384xf32>
}
%122 = flow.dispatch.region -> (tensor<1x65x65x64xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%121, %cst_88 : tensor<1x65x65x384xf32>, tensor<1x1x384x64xf32>) outs(%98 : tensor<1x65x65x64xf32>) -> tensor<1x65x65x64xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_159, %237, %cst_67 : tensor<1x65x65x64xf32>, tensor<1x65x65x64xf32>, tensor<64xf32>) outs(%99 : tensor<1x65x65x64xi8>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: i8):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_8 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_9 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
linalg.yield %259 : i8
} -> tensor<1x65x65x64xi8>
flow.return %238 : tensor<1x65x65x64xi8>
}
%collapsed_165 = tensor.collapse_shape %122 [[0, 1], [2], [3]] : tensor<1x65x65x64xi8> into tensor<65x65x64xi8>
%123 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_165 : tensor<65x65x64xi8>) outs(%96 : tensor<65x65x64xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_9 : f32
linalg.yield %239 : f32
} -> tensor<65x65x64xf32>
%expanded_166 = tensor.expand_shape %123 [[0, 1], [2], [3]] output_shape [1, 65, 65, 64] : tensor<65x65x64xf32> into tensor<1x65x65x64xf32>
%124 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_166, %cst_89 : tensor<1x65x65x64xf32>, tensor<1x1x64x384xf32>) outs(%103 : tensor<1x65x65x384xf32>) -> tensor<1x65x65x384xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_69 : tensor<1x65x65x384xf32>, tensor<384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x384xf32>
flow.return %238 : tensor<1x65x65x384xf32>
}
%collapsed_167 = tensor.collapse_shape %124 [[0, 1], [2], [3]] : tensor<1x65x65x384xf32> into tensor<65x65x384xf32>
%inserted_slice_168 = tensor.insert_slice %collapsed_167 into %106[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
%expanded_169 = tensor.expand_shape %inserted_slice_168 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%125 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_169 : tensor<69x69x1x384xf32>) outs(%107 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %237 : tensor<1x384x69x69xf32>
}
%126 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%125, %cst_68 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%110 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %237 : tensor<1x384x65x65xf32>
}
%127 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126 : tensor<1x384x65x65xf32>) outs(%112 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x384xf32>
flow.return %237 : tensor<65x65x1x384xf32>
}
%collapsed_170 = tensor.collapse_shape %127 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%expanded_171 = tensor.expand_shape %collapsed_170 [[0, 1], [2], [3]] output_shape [1, 65, 65, 384] : tensor<65x65x384xf32> into tensor<1x65x65x384xf32>
%128 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_171 : tensor<1x65x65x384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x384xf32>
flow.return %237 : tensor<1x65x65x384xf32>
}
%129 = flow.dispatch.region -> (tensor<1x65x65x64xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%128, %cst_90 : tensor<1x65x65x384xf32>, tensor<1x1x384x64xf32>) outs(%98 : tensor<1x65x65x64xf32>) -> tensor<1x65x65x64xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_166, %237, %cst_70 : tensor<1x65x65x64xf32>, tensor<1x65x65x64xf32>, tensor<64xf32>) outs(%97 : tensor<1x65x65x64xf32>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: f32):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_8 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_9 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
%260 = arith.extsi %259 : i8 to i32
%261 = arith.sitofp %260 : i32 to f32
%262 = arith.mulf %261, %cst_9 : f32
linalg.yield %262 : f32
} -> tensor<1x65x65x64xf32>
flow.return %238 : tensor<1x65x65x64xf32>
}
%130 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%129, %cst_91 : tensor<1x65x65x64xf32>, tensor<1x1x64x384xf32>) outs(%103 : tensor<1x65x65x384xf32>) -> tensor<1x65x65x384xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_32 : tensor<1x65x65x384xf32>, tensor<384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x384xf32>
flow.return %238 : tensor<1x65x65x384xf32>
}
%collapsed_172 = tensor.collapse_shape %130 [[0, 1], [2], [3]] : tensor<1x65x65x384xf32> into tensor<65x65x384xf32>
%inserted_slice_173 = tensor.insert_slice %collapsed_172 into %106[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
%expanded_174 = tensor.expand_shape %inserted_slice_173 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%131 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_174 : tensor<69x69x1x384xf32>) outs(%107 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %237 : tensor<1x384x69x69xf32>
}
%132 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%131, %cst_31 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%110 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %237 : tensor<1x384x65x65xf32>
}
%133 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%132 : tensor<1x384x65x65xf32>) outs(%112 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x384xf32>
flow.return %237 : tensor<65x65x1x384xf32>
}
%collapsed_175 = tensor.collapse_shape %133 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%expanded_176 = tensor.expand_shape %collapsed_175 [[0, 1], [2], [3]] output_shape [1, 65, 65, 384] : tensor<65x65x384xf32> into tensor<1x65x65x384xf32>
%134 = flow.dispatch.region -> (tensor<1x65x65x384xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_176 : tensor<1x65x65x384xf32>) outs(%102 : tensor<1x65x65x384xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x384xf32>
flow.return %237 : tensor<1x65x65x384xf32>
}
%135 = tensor.empty() : tensor<65x65x96xf32>
%136 = tensor.empty() : tensor<1x65x65x96xf32>
%137 = linalg.fill ins(%cst_14 : f32) outs(%136 : tensor<1x65x65x96xf32>) -> tensor<1x65x65x96xf32>
%138 = tensor.empty() : tensor<1x65x65x96xi8>
%139 = flow.dispatch.region -> (tensor<1x65x65x96xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%134, %cst_92 : tensor<1x65x65x384xf32>, tensor<1x1x384x96xf32>) outs(%137 : tensor<1x65x65x96xf32>) -> tensor<1x65x65x96xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_33 : tensor<1x65x65x96xf32>, tensor<96xf32>) outs(%138 : tensor<1x65x65x96xi8>) {
^bb0(%in: f32, %in_229: f32, %out: i8):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
linalg.yield %247 : i8
} -> tensor<1x65x65x96xi8>
flow.return %238 : tensor<1x65x65x96xi8>
}
%collapsed_177 = tensor.collapse_shape %139 [[0, 1], [2], [3]] : tensor<1x65x65x96xi8> into tensor<65x65x96xi8>
%140 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_177 : tensor<65x65x96xi8>) outs(%135 : tensor<65x65x96xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_8 : f32
linalg.yield %239 : f32
} -> tensor<65x65x96xf32>
%expanded_178 = tensor.expand_shape %140 [[0, 1], [2], [3]] output_shape [1, 65, 65, 96] : tensor<65x65x96xf32> into tensor<1x65x65x96xf32>
%141 = tensor.empty() : tensor<1x65x65x576xf32>
%142 = linalg.fill ins(%cst_14 : f32) outs(%141 : tensor<1x65x65x576xf32>) -> tensor<1x65x65x576xf32>
%143 = flow.dispatch.region -> (tensor<1x65x65x576xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_178, %cst_93 : tensor<1x65x65x96xf32>, tensor<1x1x96x576xf32>) outs(%142 : tensor<1x65x65x576xf32>) -> tensor<1x65x65x576xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_19 : tensor<1x65x65x576xf32>, tensor<576xf32>) outs(%141 : tensor<1x65x65x576xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x576xf32>
flow.return %238 : tensor<1x65x65x576xf32>
}
%collapsed_179 = tensor.collapse_shape %143 [[0, 1], [2], [3]] : tensor<1x65x65x576xf32> into tensor<65x65x576xf32>
%144 = tensor.empty() : tensor<69x69x576xf32>
%145 = linalg.fill ins(%cst_14 : f32) outs(%144 : tensor<69x69x576xf32>) -> tensor<69x69x576xf32>
%inserted_slice_180 = tensor.insert_slice %collapsed_179 into %145[2, 2, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<69x69x576xf32>
%expanded_181 = tensor.expand_shape %inserted_slice_180 [[0], [1], [2, 3]] output_shape [69, 69, 1, 576] : tensor<69x69x576xf32> into tensor<69x69x1x576xf32>
%146 = tensor.empty() : tensor<1x576x69x69xf32>
%147 = flow.dispatch.region -> (tensor<1x576x69x69xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_181 : tensor<69x69x1x576xf32>) outs(%146 : tensor<1x576x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x576x69x69xf32>
flow.return %237 : tensor<1x576x69x69xf32>
}
%148 = tensor.empty() : tensor<1x576x65x65xf32>
%149 = linalg.fill ins(%cst_14 : f32) outs(%148 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
%150 = flow.dispatch.region -> (tensor<1x576x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%147, %cst_34 : tensor<1x576x69x69xf32>, tensor<576x3x3xf32>) outs(%149 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
flow.return %237 : tensor<1x576x65x65xf32>
}
%151 = tensor.empty() : tensor<65x65x1x576xf32>
%152 = flow.dispatch.region -> (tensor<65x65x1x576xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%150 : tensor<1x576x65x65xf32>) outs(%151 : tensor<65x65x1x576xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x576xf32>
flow.return %237 : tensor<65x65x1x576xf32>
}
%collapsed_182 = tensor.collapse_shape %152 [[0], [1], [2, 3]] : tensor<65x65x1x576xf32> into tensor<65x65x576xf32>
%expanded_183 = tensor.expand_shape %collapsed_182 [[0, 1], [2], [3]] output_shape [1, 65, 65, 576] : tensor<65x65x576xf32> into tensor<1x65x65x576xf32>
%153 = flow.dispatch.region -> (tensor<1x65x65x576xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_183 : tensor<1x65x65x576xf32>) outs(%141 : tensor<1x65x65x576xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x576xf32>
flow.return %237 : tensor<1x65x65x576xf32>
}
%154 = flow.dispatch.region -> (tensor<1x65x65x96xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%153, %cst_94 : tensor<1x65x65x576xf32>, tensor<1x1x576x96xf32>) outs(%137 : tensor<1x65x65x96xf32>) -> tensor<1x65x65x96xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_178, %237, %cst_35 : tensor<1x65x65x96xf32>, tensor<1x65x65x96xf32>, tensor<96xf32>) outs(%138 : tensor<1x65x65x96xi8>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: i8):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_8 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_8 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
linalg.yield %259 : i8
} -> tensor<1x65x65x96xi8>
flow.return %238 : tensor<1x65x65x96xi8>
}
%collapsed_184 = tensor.collapse_shape %154 [[0, 1], [2], [3]] : tensor<1x65x65x96xi8> into tensor<65x65x96xi8>
%155 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_184 : tensor<65x65x96xi8>) outs(%135 : tensor<65x65x96xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_8 : f32
linalg.yield %239 : f32
} -> tensor<65x65x96xf32>
%expanded_185 = tensor.expand_shape %155 [[0, 1], [2], [3]] output_shape [1, 65, 65, 96] : tensor<65x65x96xf32> into tensor<1x65x65x96xf32>
%156 = flow.dispatch.region -> (tensor<1x65x65x576xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_185, %cst_95 : tensor<1x65x65x96xf32>, tensor<1x1x96x576xf32>) outs(%142 : tensor<1x65x65x576xf32>) -> tensor<1x65x65x576xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_37 : tensor<1x65x65x576xf32>, tensor<576xf32>) outs(%141 : tensor<1x65x65x576xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x576xf32>
flow.return %238 : tensor<1x65x65x576xf32>
}
%collapsed_186 = tensor.collapse_shape %156 [[0, 1], [2], [3]] : tensor<1x65x65x576xf32> into tensor<65x65x576xf32>
%inserted_slice_187 = tensor.insert_slice %collapsed_186 into %145[2, 2, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<69x69x576xf32>
%expanded_188 = tensor.expand_shape %inserted_slice_187 [[0], [1], [2, 3]] output_shape [69, 69, 1, 576] : tensor<69x69x576xf32> into tensor<69x69x1x576xf32>
%157 = flow.dispatch.region -> (tensor<1x576x69x69xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_188 : tensor<69x69x1x576xf32>) outs(%146 : tensor<1x576x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x576x69x69xf32>
flow.return %237 : tensor<1x576x69x69xf32>
}
%158 = flow.dispatch.region -> (tensor<1x576x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%157, %cst_36 : tensor<1x576x69x69xf32>, tensor<576x3x3xf32>) outs(%149 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
flow.return %237 : tensor<1x576x65x65xf32>
}
%159 = flow.dispatch.region -> (tensor<65x65x1x576xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%158 : tensor<1x576x65x65xf32>) outs(%151 : tensor<65x65x1x576xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x576xf32>
flow.return %237 : tensor<65x65x1x576xf32>
}
%collapsed_189 = tensor.collapse_shape %159 [[0], [1], [2, 3]] : tensor<65x65x1x576xf32> into tensor<65x65x576xf32>
%expanded_190 = tensor.expand_shape %collapsed_189 [[0, 1], [2], [3]] output_shape [1, 65, 65, 576] : tensor<65x65x576xf32> into tensor<1x65x65x576xf32>
%160 = flow.dispatch.region -> (tensor<1x65x65x576xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_190 : tensor<1x65x65x576xf32>) outs(%141 : tensor<1x65x65x576xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x576xf32>
flow.return %237 : tensor<1x65x65x576xf32>
}
%161 = flow.dispatch.region -> (tensor<1x65x65x96xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%160, %cst_96 : tensor<1x65x65x576xf32>, tensor<1x1x576x96xf32>) outs(%137 : tensor<1x65x65x96xf32>) -> tensor<1x65x65x96xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_185, %237, %cst_38 : tensor<1x65x65x96xf32>, tensor<1x65x65x96xf32>, tensor<96xf32>) outs(%136 : tensor<1x65x65x96xf32>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: f32):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_8 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_8 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
%260 = arith.extsi %259 : i8 to i32
%261 = arith.sitofp %260 : i32 to f32
%262 = arith.mulf %261, %cst_8 : f32
linalg.yield %262 : f32
} -> tensor<1x65x65x96xf32>
flow.return %238 : tensor<1x65x65x96xf32>
}
%162 = flow.dispatch.region -> (tensor<1x65x65x576xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%161, %cst_97 : tensor<1x65x65x96xf32>, tensor<1x1x96x576xf32>) outs(%142 : tensor<1x65x65x576xf32>) -> tensor<1x65x65x576xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_40 : tensor<1x65x65x576xf32>, tensor<576xf32>) outs(%141 : tensor<1x65x65x576xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x576xf32>
flow.return %238 : tensor<1x65x65x576xf32>
}
%collapsed_191 = tensor.collapse_shape %162 [[0, 1], [2], [3]] : tensor<1x65x65x576xf32> into tensor<65x65x576xf32>
%inserted_slice_192 = tensor.insert_slice %collapsed_191 into %145[2, 2, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<69x69x576xf32>
%expanded_193 = tensor.expand_shape %inserted_slice_192 [[0], [1], [2, 3]] output_shape [69, 69, 1, 576] : tensor<69x69x576xf32> into tensor<69x69x1x576xf32>
%163 = flow.dispatch.region -> (tensor<1x576x69x69xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_193 : tensor<69x69x1x576xf32>) outs(%146 : tensor<1x576x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x576x69x69xf32>
flow.return %237 : tensor<1x576x69x69xf32>
}
%164 = flow.dispatch.region -> (tensor<1x576x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%163, %cst_39 : tensor<1x576x69x69xf32>, tensor<576x3x3xf32>) outs(%149 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
flow.return %237 : tensor<1x576x65x65xf32>
}
%165 = flow.dispatch.region -> (tensor<65x65x1x576xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%164 : tensor<1x576x65x65xf32>) outs(%151 : tensor<65x65x1x576xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x576xf32>
flow.return %237 : tensor<65x65x1x576xf32>
}
%collapsed_194 = tensor.collapse_shape %165 [[0], [1], [2, 3]] : tensor<65x65x1x576xf32> into tensor<65x65x576xf32>
%expanded_195 = tensor.expand_shape %collapsed_194 [[0, 1], [2], [3]] output_shape [1, 65, 65, 576] : tensor<65x65x576xf32> into tensor<1x65x65x576xf32>
%166 = flow.dispatch.region -> (tensor<1x65x65x576xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_195 : tensor<1x65x65x576xf32>) outs(%141 : tensor<1x65x65x576xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x576xf32>
flow.return %237 : tensor<1x65x65x576xf32>
}
%167 = tensor.empty() : tensor<65x65x160xf32>
%168 = tensor.empty() : tensor<1x65x65x160xf32>
%169 = linalg.fill ins(%cst_14 : f32) outs(%168 : tensor<1x65x65x160xf32>) -> tensor<1x65x65x160xf32>
%170 = tensor.empty() : tensor<1x65x65x160xi8>
%171 = flow.dispatch.region -> (tensor<1x65x65x160xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%166, %cst_98 : tensor<1x65x65x576xf32>, tensor<1x1x576x160xf32>) outs(%169 : tensor<1x65x65x160xf32>) -> tensor<1x65x65x160xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_41 : tensor<1x65x65x160xf32>, tensor<160xf32>) outs(%170 : tensor<1x65x65x160xi8>) {
^bb0(%in: f32, %in_229: f32, %out: i8):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
linalg.yield %247 : i8
} -> tensor<1x65x65x160xi8>
flow.return %238 : tensor<1x65x65x160xi8>
}
%collapsed_196 = tensor.collapse_shape %171 [[0, 1], [2], [3]] : tensor<1x65x65x160xi8> into tensor<65x65x160xi8>
%172 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_196 : tensor<65x65x160xi8>) outs(%167 : tensor<65x65x160xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_8 : f32
linalg.yield %239 : f32
} -> tensor<65x65x160xf32>
%expanded_197 = tensor.expand_shape %172 [[0, 1], [2], [3]] output_shape [1, 65, 65, 160] : tensor<65x65x160xf32> into tensor<1x65x65x160xf32>
%173 = tensor.empty() : tensor<1x65x65x960xf32>
%174 = linalg.fill ins(%cst_14 : f32) outs(%173 : tensor<1x65x65x960xf32>) -> tensor<1x65x65x960xf32>
%175 = flow.dispatch.region -> (tensor<1x65x65x960xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_197, %cst_99 : tensor<1x65x65x160xf32>, tensor<1x1x160x960xf32>) outs(%174 : tensor<1x65x65x960xf32>) -> tensor<1x65x65x960xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_20 : tensor<1x65x65x960xf32>, tensor<960xf32>) outs(%173 : tensor<1x65x65x960xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x960xf32>
flow.return %238 : tensor<1x65x65x960xf32>
}
%collapsed_198 = tensor.collapse_shape %175 [[0, 1], [2], [3]] : tensor<1x65x65x960xf32> into tensor<65x65x960xf32>
%176 = tensor.empty() : tensor<73x73x960xf32>
%177 = linalg.fill ins(%cst_14 : f32) outs(%176 : tensor<73x73x960xf32>) -> tensor<73x73x960xf32>
%inserted_slice_199 = tensor.insert_slice %collapsed_198 into %177[4, 4, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<73x73x960xf32>
%expanded_200 = tensor.expand_shape %inserted_slice_199 [[0], [1], [2, 3]] output_shape [73, 73, 1, 960] : tensor<73x73x960xf32> into tensor<73x73x1x960xf32>
%178 = tensor.empty() : tensor<1x960x73x73xf32>
%179 = flow.dispatch.region -> (tensor<1x960x73x73xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_200 : tensor<73x73x1x960xf32>) outs(%178 : tensor<1x960x73x73xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x960x73x73xf32>
flow.return %237 : tensor<1x960x73x73xf32>
}
%180 = tensor.empty() : tensor<1x960x65x65xf32>
%181 = linalg.fill ins(%cst_14 : f32) outs(%180 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
%182 = flow.dispatch.region -> (tensor<1x960x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<4> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%179, %cst_42 : tensor<1x960x73x73xf32>, tensor<960x3x3xf32>) outs(%181 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
flow.return %237 : tensor<1x960x65x65xf32>
}
%183 = tensor.empty() : tensor<65x65x1x960xf32>
%184 = flow.dispatch.region -> (tensor<65x65x1x960xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%182 : tensor<1x960x65x65xf32>) outs(%183 : tensor<65x65x1x960xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x960xf32>
flow.return %237 : tensor<65x65x1x960xf32>
}
%collapsed_201 = tensor.collapse_shape %184 [[0], [1], [2, 3]] : tensor<65x65x1x960xf32> into tensor<65x65x960xf32>
%expanded_202 = tensor.expand_shape %collapsed_201 [[0, 1], [2], [3]] output_shape [1, 65, 65, 960] : tensor<65x65x960xf32> into tensor<1x65x65x960xf32>
%185 = flow.dispatch.region -> (tensor<1x65x65x960xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_202 : tensor<1x65x65x960xf32>) outs(%173 : tensor<1x65x65x960xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x960xf32>
flow.return %237 : tensor<1x65x65x960xf32>
}
%186 = flow.dispatch.region -> (tensor<1x65x65x160xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%185, %cst_100 : tensor<1x65x65x960xf32>, tensor<1x1x960x160xf32>) outs(%169 : tensor<1x65x65x160xf32>) -> tensor<1x65x65x160xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_197, %237, %cst_43 : tensor<1x65x65x160xf32>, tensor<1x65x65x160xf32>, tensor<160xf32>) outs(%170 : tensor<1x65x65x160xi8>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: i8):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_10 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_10 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_8 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
linalg.yield %259 : i8
} -> tensor<1x65x65x160xi8>
flow.return %238 : tensor<1x65x65x160xi8>
}
%collapsed_203 = tensor.collapse_shape %186 [[0, 1], [2], [3]] : tensor<1x65x65x160xi8> into tensor<65x65x160xi8>
%187 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_203 : tensor<65x65x160xi8>) outs(%167 : tensor<65x65x160xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_8 : f32
linalg.yield %239 : f32
} -> tensor<65x65x160xf32>
%expanded_204 = tensor.expand_shape %187 [[0, 1], [2], [3]] output_shape [1, 65, 65, 160] : tensor<65x65x160xf32> into tensor<1x65x65x160xf32>
%188 = flow.dispatch.region -> (tensor<1x65x65x960xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_204, %cst_101 : tensor<1x65x65x160xf32>, tensor<1x1x160x960xf32>) outs(%174 : tensor<1x65x65x960xf32>) -> tensor<1x65x65x960xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_45 : tensor<1x65x65x960xf32>, tensor<960xf32>) outs(%173 : tensor<1x65x65x960xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x960xf32>
flow.return %238 : tensor<1x65x65x960xf32>
}
%collapsed_205 = tensor.collapse_shape %188 [[0, 1], [2], [3]] : tensor<1x65x65x960xf32> into tensor<65x65x960xf32>
%inserted_slice_206 = tensor.insert_slice %collapsed_205 into %177[4, 4, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<73x73x960xf32>
%expanded_207 = tensor.expand_shape %inserted_slice_206 [[0], [1], [2, 3]] output_shape [73, 73, 1, 960] : tensor<73x73x960xf32> into tensor<73x73x1x960xf32>
%189 = flow.dispatch.region -> (tensor<1x960x73x73xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_207 : tensor<73x73x1x960xf32>) outs(%178 : tensor<1x960x73x73xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x960x73x73xf32>
flow.return %237 : tensor<1x960x73x73xf32>
}
%190 = flow.dispatch.region -> (tensor<1x960x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<4> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%189, %cst_44 : tensor<1x960x73x73xf32>, tensor<960x3x3xf32>) outs(%181 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
flow.return %237 : tensor<1x960x65x65xf32>
}
%191 = flow.dispatch.region -> (tensor<65x65x1x960xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%190 : tensor<1x960x65x65xf32>) outs(%183 : tensor<65x65x1x960xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x960xf32>
flow.return %237 : tensor<65x65x1x960xf32>
}
%collapsed_208 = tensor.collapse_shape %191 [[0], [1], [2, 3]] : tensor<65x65x1x960xf32> into tensor<65x65x960xf32>
%expanded_209 = tensor.expand_shape %collapsed_208 [[0, 1], [2], [3]] output_shape [1, 65, 65, 960] : tensor<65x65x960xf32> into tensor<1x65x65x960xf32>
%192 = flow.dispatch.region -> (tensor<1x65x65x960xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_209 : tensor<1x65x65x960xf32>) outs(%173 : tensor<1x65x65x960xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x960xf32>
flow.return %237 : tensor<1x65x65x960xf32>
}
%193 = flow.dispatch.region -> (tensor<1x65x65x160xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%192, %cst_102 : tensor<1x65x65x960xf32>, tensor<1x1x960x160xf32>) outs(%169 : tensor<1x65x65x160xf32>) -> tensor<1x65x65x160xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_204, %237, %cst_46 : tensor<1x65x65x160xf32>, tensor<1x65x65x160xf32>, tensor<160xf32>) outs(%168 : tensor<1x65x65x160xf32>) {
^bb0(%in: f32, %in_229: f32, %in_230: f32, %out: f32):
%239 = arith.addf %in_229, %in_230 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_8 : f32
%251 = arith.addf %in, %250 : f32
%252 = arith.divf %251, %cst_8 : f32
%253 = math.round %252 : f32
%254 = arith.addf %253, %cst_14 : f32
%255 = arith.cmpf ult, %254, %cst_16 : f32
%256 = arith.cmpf ugt, %254, %cst_15 : f32
%257 = arith.select %255, %cst_16, %254 : f32
%258 = arith.select %256, %cst_15, %257 : f32
%259 = arith.fptosi %258 : f32 to i8
%260 = arith.extsi %259 : i8 to i32
%261 = arith.sitofp %260 : i32 to f32
%262 = arith.mulf %261, %cst_8 : f32
linalg.yield %262 : f32
} -> tensor<1x65x65x160xf32>
flow.return %238 : tensor<1x65x65x160xf32>
}
%194 = flow.dispatch.region -> (tensor<1x65x65x960xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%193, %cst_103 : tensor<1x65x65x160xf32>, tensor<1x1x160x960xf32>) outs(%174 : tensor<1x65x65x960xf32>) -> tensor<1x65x65x960xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_48 : tensor<1x65x65x960xf32>, tensor<960xf32>) outs(%173 : tensor<1x65x65x960xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ult, %239, %cst_14 : f32
%241 = arith.select %240, %cst_14, %239 : f32
%242 = arith.cmpf ugt, %241, %cst_2 : f32
%243 = arith.select %242, %cst_2, %241 : f32
%244 = arith.divf %243, %cst_10 : f32
%245 = math.round %244 : f32
%246 = arith.addf %245, %cst_14 : f32
%247 = arith.cmpf ult, %246, %cst_16 : f32
%248 = arith.cmpf ugt, %246, %cst_15 : f32
%249 = arith.select %247, %cst_16, %246 : f32
%250 = arith.select %248, %cst_15, %249 : f32
%251 = arith.fptosi %250 : f32 to i8
%252 = arith.extsi %251 : i8 to i32
%253 = arith.sitofp %252 : i32 to f32
%254 = arith.mulf %253, %cst_10 : f32
linalg.yield %254 : f32
} -> tensor<1x65x65x960xf32>
flow.return %238 : tensor<1x65x65x960xf32>
}
%collapsed_210 = tensor.collapse_shape %194 [[0, 1], [2], [3]] : tensor<1x65x65x960xf32> into tensor<65x65x960xf32>
%inserted_slice_211 = tensor.insert_slice %collapsed_210 into %177[4, 4, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<73x73x960xf32>
%expanded_212 = tensor.expand_shape %inserted_slice_211 [[0], [1], [2, 3]] output_shape [73, 73, 1, 960] : tensor<73x73x960xf32> into tensor<73x73x1x960xf32>
%195 = flow.dispatch.region -> (tensor<1x960x73x73xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_212 : tensor<73x73x1x960xf32>) outs(%178 : tensor<1x960x73x73xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x960x73x73xf32>
flow.return %237 : tensor<1x960x73x73xf32>
}
%196 = flow.dispatch.region -> (tensor<1x960x65x65xf32>) {
%237 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<4> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%195, %cst_47 : tensor<1x960x73x73xf32>, tensor<960x3x3xf32>) outs(%181 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
flow.return %237 : tensor<1x960x65x65xf32>
}
%197 = flow.dispatch.region -> (tensor<65x65x1x960xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%196 : tensor<1x960x65x65xf32>) outs(%183 : tensor<65x65x1x960xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.cmpf ult, %in, %cst_14 : f32
%239 = arith.select %238, %cst_14, %in : f32
%240 = arith.cmpf ugt, %239, %cst_2 : f32
%241 = arith.select %240, %cst_2, %239 : f32
linalg.yield %241 : f32
} -> tensor<65x65x1x960xf32>
flow.return %237 : tensor<65x65x1x960xf32>
}
%collapsed_213 = tensor.collapse_shape %197 [[0], [1], [2, 3]] : tensor<65x65x1x960xf32> into tensor<65x65x960xf32>
%expanded_214 = tensor.expand_shape %collapsed_213 [[0, 1], [2], [3]] output_shape [1, 65, 65, 960] : tensor<65x65x960xf32> into tensor<1x65x65x960xf32>
%198 = flow.dispatch.region -> (tensor<1x65x65x960xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_214 : tensor<1x65x65x960xf32>) outs(%173 : tensor<1x65x65x960xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_10 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_10 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x960xf32>
flow.return %237 : tensor<1x65x65x960xf32>
}
%199 = tensor.empty() : tensor<65x65x320xf32>
%200 = tensor.empty() : tensor<1x65x65x320xf32>
%201 = linalg.fill ins(%cst_14 : f32) outs(%200 : tensor<1x65x65x320xf32>) -> tensor<1x65x65x320xf32>
%202 = tensor.empty() : tensor<1x65x65x320xi8>
%203 = flow.dispatch.region -> (tensor<1x65x65x320xi8>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%198, %cst_104 : tensor<1x65x65x960xf32>, tensor<1x1x960x320xf32>) outs(%201 : tensor<1x65x65x320xf32>) -> tensor<1x65x65x320xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_49 : tensor<1x65x65x320xf32>, tensor<320xf32>) outs(%202 : tensor<1x65x65x320xi8>) {
^bb0(%in: f32, %in_229: f32, %out: i8):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
linalg.yield %247 : i8
} -> tensor<1x65x65x320xi8>
flow.return %238 : tensor<1x65x65x320xi8>
}
%collapsed_215 = tensor.collapse_shape %203 [[0, 1], [2], [3]] : tensor<1x65x65x320xi8> into tensor<65x65x320xi8>
%204 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%collapsed_215 : tensor<65x65x320xi8>) outs(%199 : tensor<65x65x320xf32>) {
^bb0(%in: i8, %out: f32):
%237 = arith.extsi %in : i8 to i32
%238 = arith.sitofp %237 : i32 to f32
%239 = arith.mulf %238, %cst_8 : f32
linalg.yield %239 : f32
} -> tensor<65x65x320xf32>
%expanded_216 = tensor.expand_shape %204 [[0, 1], [2], [3]] output_shape [1, 65, 65, 320] : tensor<65x65x320xf32> into tensor<1x65x65x320xf32>
%205 = tensor.empty() : tensor<320x65x65xf32>
%206 = flow.dispatch.region -> (tensor<320x65x65xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%204 : tensor<65x65x320xf32>) outs(%205 : tensor<320x65x65xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<320x65x65xf32>
flow.return %237 : tensor<320x65x65xf32>
}
%expanded_217 = tensor.expand_shape %206 [[0, 1], [2], [3]] output_shape [1, 320, 65, 65] : tensor<320x65x65xf32> into tensor<1x320x65x65xf32>
%207 = tensor.empty() : tensor<1x320x1x1xf32>
%208 = linalg.fill ins(%cst_14 : f32) outs(%207 : tensor<1x320x1x1xf32>) -> tensor<1x320x1x1xf32>
%209 = tensor.empty() : tensor<65x65xf32>
%210 = flow.dispatch.region -> (tensor<1x320x1x1xf32>) {
%237 = linalg.pooling_nchw_sum {dilations = dense<1> : vector<2xi64>, strides = dense<65> : vector<2xi64>} ins(%expanded_217, %209 : tensor<1x320x65x65xf32>, tensor<65x65xf32>) outs(%208 : tensor<1x320x1x1xf32>) -> tensor<1x320x1x1xf32>
flow.return %237 : tensor<1x320x1x1xf32>
}
%collapsed_218 = tensor.collapse_shape %210 [[0, 1, 2, 3]] : tensor<1x320x1x1xf32> into tensor<320xf32>
%211 = tensor.empty() : tensor<1x65x65x256xf32>
%212 = linalg.fill ins(%cst_14 : f32) outs(%211 : tensor<1x65x65x256xf32>) -> tensor<1x65x65x256xf32>
%expanded_219 = tensor.expand_shape %collapsed_218 [[0, 1, 2, 3]] output_shape [1, 1, 1, 320] : tensor<320xf32> into tensor<1x1x1x320xf32>
%213 = tensor.empty() : tensor<1x1x1x320xf32>
%214 = flow.dispatch.region -> (tensor<1x1x1x320xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_219 : tensor<1x1x1x320xf32>) outs(%213 : tensor<1x1x1x320xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_17 : f32
%239 = arith.mulf %238, %cst_1 : f32
%240 = arith.divf %239, %cst_8 : f32
%241 = math.round %240 : f32
%242 = arith.addf %241, %cst_14 : f32
%243 = arith.cmpf ult, %242, %cst_16 : f32
%244 = arith.cmpf ugt, %242, %cst_15 : f32
%245 = arith.select %243, %cst_16, %242 : f32
%246 = arith.select %244, %cst_15, %245 : f32
%247 = arith.fptosi %246 : f32 to i8
%248 = arith.extsi %247 : i8 to i32
%249 = arith.sitofp %248 : i32 to f32
%250 = arith.mulf %249, %cst_8 : f32
linalg.yield %250 : f32
} -> tensor<1x1x1x320xf32>
flow.return %237 : tensor<1x1x1x320xf32>
}
%215 = tensor.empty() : tensor<256x65x65xf32>
%216 = tensor.empty() : tensor<256x1x65x65xf32>
%217 = flow.dispatch.region -> (tensor<256x1x65x65xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_216, %cst_105 : tensor<1x65x65x320xf32>, tensor<1x1x320x256xf32>) outs(%212 : tensor<1x65x65x256xf32>) -> tensor<1x65x65x256xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_24 : tensor<1x65x65x256xf32>, tensor<256xf32>) outs(%216 : tensor<256x1x65x65xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ugt, %239, %cst_14 : f32
%241 = arith.select %240, %239, %cst_14 : f32
%242 = arith.divf %241, %cst_11 : f32
%243 = math.round %242 : f32
%244 = arith.addf %243, %cst_14 : f32
%245 = arith.cmpf ult, %244, %cst_16 : f32
%246 = arith.cmpf ugt, %244, %cst_15 : f32
%247 = arith.select %245, %cst_16, %244 : f32
%248 = arith.select %246, %cst_15, %247 : f32
%249 = arith.fptosi %248 : f32 to i8
%250 = arith.extsi %249 : i8 to i32
%251 = arith.sitofp %250 : i32 to f32
%252 = arith.mulf %251, %cst_11 : f32
linalg.yield %252 : f32
} -> tensor<256x1x65x65xf32>
flow.return %238 : tensor<256x1x65x65xf32>
}
%collapsed_220 = tensor.collapse_shape %217 [[0], [1, 2], [3]] : tensor<256x1x65x65xf32> into tensor<256x65x65xf32>
%218 = tensor.empty() : tensor<1x1x1x256xf32>
%219 = linalg.fill ins(%cst_14 : f32) outs(%218 : tensor<1x1x1x256xf32>) -> tensor<1x1x1x256xf32>
%220 = flow.dispatch.region -> (tensor<1x1x1x256xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%214, %cst_106 : tensor<1x1x1x320xf32>, tensor<1x1x320x256xf32>) outs(%219 : tensor<1x1x1x256xf32>) -> tensor<1x1x1x256xf32>
flow.return %237 : tensor<1x1x1x256xf32>
}
%collapsed_221 = tensor.collapse_shape %220 [[0, 1, 2, 3]] : tensor<1x1x1x256xf32> into tensor<256xf32>
%expanded_222 = tensor.expand_shape %collapsed_221 [[0, 1, 2, 3]] output_shape [1, 256, 1, 1] : tensor<256xf32> into tensor<1x256x1x1xf32>
%221 = tensor.empty() : tensor<1x256x1x1xf32>
%222 = flow.dispatch.region -> (tensor<1x256x1x1xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_222, %cst_0 : tensor<1x256x1x1xf32>, tensor<1x256x1x1xf32>) outs(%221 : tensor<1x256x1x1xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%238 = arith.addf %in, %in_229 : f32
%239 = arith.cmpf ugt, %238, %cst_14 : f32
%240 = arith.select %239, %238, %cst_14 : f32
%241 = arith.divf %240, %cst_11 : f32
%242 = math.round %241 : f32
%243 = arith.addf %242, %cst_14 : f32
%244 = arith.cmpf ult, %243, %cst_16 : f32
%245 = arith.cmpf ugt, %243, %cst_15 : f32
%246 = arith.select %244, %cst_16, %243 : f32
%247 = arith.select %245, %cst_15, %246 : f32
%248 = arith.fptosi %247 : f32 to i8
%249 = arith.extsi %248 : i8 to i32
%250 = arith.sitofp %249 : i32 to f32
%251 = arith.mulf %250, %cst_11 : f32
linalg.yield %251 : f32
} -> tensor<1x256x1x1xf32>
flow.return %237 : tensor<1x256x1x1xf32>
}
%223 = flow.dispatch.region -> (tensor<256x65x65xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%215 : tensor<256x65x65xf32>) {
^bb0(%out: f32):
%238 = linalg.index 0 : index
%239 = linalg.index 1 : index
%240 = linalg.index 2 : index
%241 = arith.index_cast %239 : index to i64
%242 = arith.sitofp %241 : i64 to f32
%243 = arith.addf %242, %cst_12 : f32
%244 = arith.divf %243, %cst_6 : f32
%245 = arith.subf %244, %cst_12 : f32
%246 = arith.maximumf %245, %cst_14 : f32
%247 = arith.minimumf %246, %cst_4 : f32
%248 = arith.minimumf %246, %cst_14 : f32
%249 = math.floor %247 : f32
%250 = arith.addf %247, %cst_13 : f32
%251 = math.floor %250 : f32
%252 = arith.fptosi %249 : f32 to i64
%253 = arith.index_cast %252 : i64 to index
%254 = arith.fptosi %251 : f32 to i64
%255 = arith.index_cast %254 : i64 to index
%256 = arith.index_cast %240 : index to i64
%257 = arith.sitofp %256 : i64 to f32
%258 = arith.addf %257, %cst_12 : f32
%259 = arith.divf %258, %cst_6 : f32
%260 = arith.subf %259, %cst_12 : f32
%261 = arith.maximumf %260, %cst_14 : f32
%262 = arith.minimumf %261, %cst_4 : f32
%263 = arith.minimumf %261, %cst_14 : f32
%264 = math.floor %262 : f32
%265 = arith.addf %262, %cst_13 : f32
%266 = math.floor %265 : f32
%267 = arith.fptosi %264 : f32 to i64
%268 = arith.index_cast %267 : i64 to index
%269 = arith.fptosi %266 : f32 to i64
%270 = arith.index_cast %269 : i64 to index
%extracted = tensor.extract %222[%c0, %238, %253, %268] : tensor<1x256x1x1xf32>
%extracted_229 = tensor.extract %222[%c0, %238, %253, %270] : tensor<1x256x1x1xf32>
%extracted_230 = tensor.extract %222[%c0, %238, %255, %268] : tensor<1x256x1x1xf32>
%extracted_231 = tensor.extract %222[%c0, %238, %255, %270] : tensor<1x256x1x1xf32>
%271 = arith.subf %251, %248 : f32
%272 = arith.subf %248, %249 : f32
%273 = arith.subf %266, %263 : f32
%274 = arith.subf %263, %264 : f32
%275 = arith.mulf %273, %extracted : f32
%276 = arith.mulf %274, %extracted_229 : f32
%277 = arith.addf %275, %276 : f32
%278 = arith.mulf %271, %277 : f32
%279 = arith.mulf %273, %extracted_230 : f32
%280 = arith.mulf %274, %extracted_231 : f32
%281 = arith.addf %279, %280 : f32
%282 = arith.mulf %272, %281 : f32
%283 = arith.addf %278, %282 : f32
%284 = arith.divf %283, %cst_11 : f32
%285 = math.round %284 : f32
%286 = arith.addf %285, %cst_14 : f32
%287 = arith.cmpf ult, %286, %cst_16 : f32
%288 = arith.cmpf ugt, %286, %cst_15 : f32
%289 = arith.select %287, %cst_16, %286 : f32
%290 = arith.select %288, %cst_15, %289 : f32
%291 = arith.fptosi %290 : f32 to i8
%292 = arith.extsi %291 : i8 to i32
%293 = arith.sitofp %292 : i32 to f32
%294 = arith.mulf %293, %cst_11 : f32
linalg.yield %294 : f32
} -> tensor<256x65x65xf32>
flow.return %237 : tensor<256x65x65xf32>
}
%224 = tensor.empty() : tensor<1x512x65x65xf32>
%inserted_slice_223 = tensor.insert_slice %223 into %224[0, 0, 0, 0] [1, 256, 65, 65] [1, 1, 1, 1] : tensor<256x65x65xf32> into tensor<1x512x65x65xf32>
%inserted_slice_224 = tensor.insert_slice %collapsed_220 into %inserted_slice_223[0, 256, 0, 0] [1, 256, 65, 65] [1, 1, 1, 1] : tensor<256x65x65xf32> into tensor<1x512x65x65xf32>
%collapsed_225 = tensor.collapse_shape %inserted_slice_224 [[0, 1], [2], [3]] : tensor<1x512x65x65xf32> into tensor<512x65x65xf32>
%expanded_226 = tensor.expand_shape %collapsed_225 [[0], [1, 2], [3]] output_shape [512, 1, 65, 65] : tensor<512x65x65xf32> into tensor<512x1x65x65xf32>
%225 = tensor.empty() : tensor<1x65x65x512xf32>
%226 = flow.dispatch.region -> (tensor<1x65x65x512xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_226 : tensor<512x1x65x65xf32>) outs(%225 : tensor<1x65x65x512xf32>) {
^bb0(%in: f32, %out: f32):
%238 = arith.divf %in, %cst_11 : f32
%239 = math.round %238 : f32
%240 = arith.addf %239, %cst_14 : f32
%241 = arith.cmpf ult, %240, %cst_16 : f32
%242 = arith.cmpf ugt, %240, %cst_15 : f32
%243 = arith.select %241, %cst_16, %240 : f32
%244 = arith.select %242, %cst_15, %243 : f32
%245 = arith.fptosi %244 : f32 to i8
%246 = arith.extsi %245 : i8 to i32
%247 = arith.sitofp %246 : i32 to f32
%248 = arith.mulf %247, %cst_11 : f32
linalg.yield %248 : f32
} -> tensor<1x65x65x512xf32>
flow.return %237 : tensor<1x65x65x512xf32>
}
%227 = flow.dispatch.region -> (tensor<1x65x65x256xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%226, %cst_107 : tensor<1x65x65x512xf32>, tensor<1x1x512x256xf32>) outs(%212 : tensor<1x65x65x256xf32>) -> tensor<1x65x65x256xf32>
%238 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%237, %cst_71 : tensor<1x65x65x256xf32>, tensor<256xf32>) outs(%211 : tensor<1x65x65x256xf32>) {
^bb0(%in: f32, %in_229: f32, %out: f32):
%239 = arith.addf %in, %in_229 : f32
%240 = arith.cmpf ugt, %239, %cst_14 : f32
%241 = arith.select %240, %239, %cst_14 : f32
%242 = arith.divf %241, %cst_10 : f32
%243 = math.round %242 : f32
%244 = arith.addf %243, %cst_14 : f32
%245 = arith.cmpf ult, %244, %cst_16 : f32
%246 = arith.cmpf ugt, %244, %cst_15 : f32
%247 = arith.select %245, %cst_16, %244 : f32
%248 = arith.select %246, %cst_15, %247 : f32
%249 = arith.fptosi %248 : f32 to i8
%250 = arith.extsi %249 : i8 to i32
%251 = arith.sitofp %250 : i32 to f32
%252 = arith.mulf %251, %cst_10 : f32
linalg.yield %252 : f32
} -> tensor<1x65x65x256xf32>
flow.return %238 : tensor<1x65x65x256xf32>
}
%228 = tensor.empty() : tensor<1x65x65x21xf32>
%229 = linalg.fill ins(%cst_14 : f32) outs(%228 : tensor<1x65x65x21xf32>) -> tensor<1x65x65x21xf32>
%230 = flow.dispatch.region -> (tensor<1x65x65x21xf32>) {
%237 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%227, %cst_108 : tensor<1x65x65x256xf32>, tensor<1x1x256x21xf32>) outs(%229 : tensor<1x65x65x21xf32>) -> tensor<1x65x65x21xf32>
flow.return %237 : tensor<1x65x65x21xf32>
}
%collapsed_227 = tensor.collapse_shape %230 [[0, 1], [2], [3]] : tensor<1x65x65x21xf32> into tensor<65x65x21xf32>
%expanded_228 = tensor.expand_shape %collapsed_227 [[0], [1], [2, 3]] output_shape [65, 65, 1, 21] : tensor<65x65x21xf32> into tensor<65x65x1x21xf32>
%231 = tensor.empty() : tensor<1x21x65x65xi8>
%232 = flow.dispatch.region -> (tensor<1x21x65x65xi8>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_228, %cst : tensor<65x65x1x21xf32>, tensor<1x21xf32>) outs(%231 : tensor<1x21x65x65xi8>) {
^bb0(%in: f32, %in_229: f32, %out: i8):
%238 = arith.addf %in, %in_229 : f32
%239 = arith.divf %238, %cst_9 : f32
%240 = math.round %239 : f32
%241 = arith.addf %240, %cst_14 : f32
%242 = arith.cmpf ult, %241, %cst_16 : f32
%243 = arith.cmpf ugt, %241, %cst_15 : f32
%244 = arith.select %242, %cst_16, %241 : f32
%245 = arith.select %243, %cst_15, %244 : f32
%246 = arith.fptosi %245 : f32 to i8
linalg.yield %246 : i8
} -> tensor<1x21x65x65xi8>
flow.return %237 : tensor<1x21x65x65xi8>
}
%233 = tensor.empty() : tensor<1x513x513x21xf32>
%234 = flow.dispatch.region -> (tensor<1x513x513x21xf32>) {
%237 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%233 : tensor<1x513x513x21xf32>) {
^bb0(%out: f32):
%238 = linalg.index 0 : index
%239 = linalg.index 1 : index
%240 = linalg.index 2 : index
%241 = linalg.index 3 : index
%242 = affine.apply affine_map<(d0, d1) -> (d0 + d1 * 513)>(%239, %238)
%243 = arith.index_cast %242 : index to i64
%244 = arith.sitofp %243 : i64 to f32
%245 = arith.addf %244, %cst_12 : f32
%246 = arith.divf %245, %cst_5 : f32
%247 = arith.subf %246, %cst_12 : f32
%248 = arith.maximumf %247, %cst_14 : f32
%249 = arith.minimumf %248, %cst_3 : f32
%250 = math.floor %249 : f32
%251 = arith.addf %249, %cst_13 : f32
%252 = math.floor %251 : f32
%253 = arith.fptosi %250 : f32 to i64
%254 = arith.index_cast %253 : i64 to index
%255 = arith.fptosi %252 : f32 to i64
%256 = arith.index_cast %255 : i64 to index
%257 = arith.index_cast %240 : index to i64
%258 = arith.sitofp %257 : i64 to f32
%259 = arith.addf %258, %cst_12 : f32
%260 = arith.divf %259, %cst_5 : f32
%261 = arith.subf %260, %cst_12 : f32
%262 = arith.maximumf %261, %cst_14 : f32
%263 = arith.minimumf %262, %cst_3 : f32
%264 = math.floor %263 : f32
%265 = arith.addf %263, %cst_13 : f32
%266 = math.floor %265 : f32
%267 = arith.fptosi %264 : f32 to i64
%268 = arith.index_cast %267 : i64 to index
%269 = arith.fptosi %266 : f32 to i64
%270 = arith.index_cast %269 : i64 to index
%extracted = tensor.extract %232[%c0, %241, %254, %268] : tensor<1x21x65x65xi8>
%271 = arith.extsi %extracted : i8 to i32
%272 = arith.sitofp %271 : i32 to f32
%273 = arith.mulf %272, %cst_9 : f32
%extracted_229 = tensor.extract %232[%c0, %241, %254, %270] : tensor<1x21x65x65xi8>
%274 = arith.extsi %extracted_229 : i8 to i32
%275 = arith.sitofp %274 : i32 to f32
%276 = arith.mulf %275, %cst_9 : f32
%extracted_230 = tensor.extract %232[%c0, %241, %256, %268] : tensor<1x21x65x65xi8>
%277 = arith.extsi %extracted_230 : i8 to i32
%278 = arith.sitofp %277 : i32 to f32
%279 = arith.mulf %278, %cst_9 : f32
%extracted_231 = tensor.extract %232[%c0, %241, %256, %270] : tensor<1x21x65x65xi8>
%280 = arith.extsi %extracted_231 : i8 to i32
%281 = arith.sitofp %280 : i32 to f32
%282 = arith.mulf %281, %cst_9 : f32
%283 = arith.subf %252, %249 : f32
%284 = arith.subf %249, %250 : f32
%285 = arith.subf %266, %263 : f32
%286 = arith.subf %263, %264 : f32
%287 = arith.mulf %285, %273 : f32
%288 = arith.mulf %286, %276 : f32
%289 = arith.addf %287, %288 : f32
%290 = arith.mulf %283, %289 : f32
%291 = arith.mulf %285, %279 : f32
%292 = arith.mulf %286, %282 : f32
%293 = arith.addf %291, %292 : f32
%294 = arith.mulf %284, %293 : f32
%295 = arith.addf %290, %294 : f32
%296 = arith.divf %295, %cst_9 : f32
%297 = math.round %296 : f32
%298 = arith.addf %297, %cst_14 : f32
%299 = arith.cmpf ult, %298, %cst_16 : f32
%300 = arith.cmpf ugt, %298, %cst_15 : f32
%301 = arith.select %299, %cst_16, %298 : f32
%302 = arith.select %300, %cst_15, %301 : f32
%303 = arith.fptosi %302 : f32 to i8
%304 = arith.extsi %303 : i8 to i32
%305 = arith.sitofp %304 : i32 to f32
%306 = arith.mulf %305, %cst_9 : f32
%307 = arith.divf %306, %cst_9 : f32
%308 = math.round %307 : f32
%309 = arith.addf %308, %cst_14 : f32
%310 = arith.cmpf ult, %309, %cst_16 : f32
%311 = arith.cmpf ugt, %309, %cst_15 : f32
%312 = arith.select %310, %cst_16, %309 : f32
%313 = arith.select %311, %cst_15, %312 : f32
%314 = arith.fptosi %313 : f32 to i8
%315 = arith.extsi %314 : i8 to i32
%316 = arith.sitofp %315 : i32 to f32
%317 = arith.mulf %316, %cst_9 : f32
linalg.yield %317 : f32
} -> tensor<1x513x513x21xf32>
flow.return %237 : tensor<1x513x513x21xf32>
}
%235 = hal.tensor.barrier join(%234 : tensor<1x513x513x21xf32>) => %arg2 : !hal.fence
%236 = hal.tensor.export %235 : tensor<1x513x513x21xf32> -> !hal.buffer_view
util.return %236 : !hal.buffer_view
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment