@yzhang93
Created July 10, 2024 20:59
deeplabv3 after PadToIntrinsics
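
All of the @jit_eval_* functions below are the same constant-dequantization dispatch: each imports an i8 weight or bias tensor, sign-extends it to i32, converts it to f32, and multiplies by a per-tensor power-of-two scale constant; the 4-D variants additionally transpose the weight layout through the input indexing map. As a rough, illustrative sketch only (the function and variable names here are not part of the dump), the per-element math is:

import numpy as np

def dequantize(values_i8: np.ndarray, scale: float) -> np.ndarray:
    # Mirrors the extsi -> sitofp -> mulf body of each linalg.generic below:
    # sign-extend i8 to i32, convert to f32, then apply the per-tensor scale.
    return values_i8.astype(np.int32).astype(np.float32) * scale

# e.g. @jit_eval_0 rescales a 576-element i8 tensor by 1.5625e-02 (2**-6).
example = dequantize(np.zeros(576, dtype=np.int8), 1.5625e-02)
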
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_0(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_0(%input0: tensor<576xi8>) -> (%output0: tensor<576xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576xi8>
%1 = tensor.empty() : tensor<576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<576xi8>) outs(%1 : tensor<576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_5(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_5(%input0: tensor<256xi8>) -> (%output0: tensor<256xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256xi8>
%1 = tensor.empty() : tensor<256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<256xi8>) outs(%1 : tensor<256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_1(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_1(%input0: tensor<960xi8>) -> (%output0: tensor<960xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960xi8>
%1 = tensor.empty() : tensor<960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<960xi8>) outs(%1 : tensor<960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_9(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_9(%input0: tensor<16xi8>) -> (%output0: tensor<16xf32>)"}} {
%cst = arith.constant 2.500000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16xi8>
%1 = tensor.empty() : tensor<16xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<16xi8>) outs(%1 : tensor<16xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<16xf32>
%3 = hal.tensor.export %2 "output0" : tensor<16xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_2(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_2(%input0: tensor<144xi8>) -> (%output0: tensor<144xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144xi8>
%1 = tensor.empty() : tensor<144xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<144xi8>) outs(%1 : tensor<144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_10(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_10(%input0: tensor<96x3x3xi8>) -> (%output0: tensor<96x3x3xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x3x3xi8>
%1 = tensor.empty() : tensor<96x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<96x3x3xi8>) outs(%1 : tensor<96x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_6(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_6(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_8(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_8(%input0: tensor<32x3x3xi8>) -> (%output0: tensor<32x3x3xf32>)"}} {
%cst = arith.constant 5.000000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x3x3xi8>
%1 = tensor.empty() : tensor<32x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<32x3x3xi8>) outs(%1 : tensor<32x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_4(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_4(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_3(%input0: tensor<192xi8>) -> (%output0: tensor<192xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192xi8>
%1 = tensor.empty() : tensor<192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<192xi8>) outs(%1 : tensor<192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_7(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_7(%input0: tensor<32x3x3x3xi8>) -> (%output0: tensor<3x3x3x32xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x3x3x3xi8>
%1 = tensor.empty() : tensor<3x3x3x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d2, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x3x3x3xi8>) outs(%1 : tensor<3x3x3x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<3x3x3x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<3x3x3x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_12(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_12(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_13(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_13(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_14(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_14(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_15(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_15(%input0: tensor<576x3x3xi8>) -> (%output0: tensor<576x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x3x3xi8>
%1 = tensor.empty() : tensor<576x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<576x3x3xi8>) outs(%1 : tensor<576x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_16(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_16(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_11(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_11(%input0: tensor<24xi8>) -> (%output0: tensor<24xf32>)"}} {
%cst = arith.constant 5.000000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24xi8>
%1 = tensor.empty() : tensor<24xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<24xi8>) outs(%1 : tensor<24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_20(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_20(%input0: tensor<576x3x3xi8>) -> (%output0: tensor<576x3x3xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x3x3xi8>
%1 = tensor.empty() : tensor<576x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<576x3x3xi8>) outs(%1 : tensor<576x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_24(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_24(%input0: tensor<160xi8>) -> (%output0: tensor<160xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160xi8>
%1 = tensor.empty() : tensor<160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<160xi8>) outs(%1 : tensor<160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_17(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_17(%input0: tensor<576x3x3xi8>) -> (%output0: tensor<576x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x3x3xi8>
%1 = tensor.empty() : tensor<576x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<576x3x3xi8>) outs(%1 : tensor<576x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_26(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_26(%input0: tensor<960xi8>) -> (%output0: tensor<960xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960xi8>
%1 = tensor.empty() : tensor<960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<960xi8>) outs(%1 : tensor<960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_25(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_25(%input0: tensor<960x3x3xi8>) -> (%output0: tensor<960x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x3x3xi8>
%1 = tensor.empty() : tensor<960x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<960x3x3xi8>) outs(%1 : tensor<960x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_27(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_27(%input0: tensor<160xi8>) -> (%output0: tensor<160xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160xi8>
%1 = tensor.empty() : tensor<160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<160xi8>) outs(%1 : tensor<160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_29(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_29(%input0: tensor<960xi8>) -> (%output0: tensor<960xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960xi8>
%1 = tensor.empty() : tensor<960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<960xi8>) outs(%1 : tensor<960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_23(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_23(%input0: tensor<960x3x3xi8>) -> (%output0: tensor<960x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x3x3xi8>
%1 = tensor.empty() : tensor<960x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<960x3x3xi8>) outs(%1 : tensor<960x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_18(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_18(%input0: tensor<576xi8>) -> (%output0: tensor<576xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576xi8>
%1 = tensor.empty() : tensor<576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<576xi8>) outs(%1 : tensor<576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_19(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_19(%input0: tensor<96xi8>) -> (%output0: tensor<96xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96xi8>
%1 = tensor.empty() : tensor<96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<96xi8>) outs(%1 : tensor<96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_22(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_22(%input0: tensor<160xi8>) -> (%output0: tensor<160xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160xi8>
%1 = tensor.empty() : tensor<160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<160xi8>) outs(%1 : tensor<160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_21(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_21(%input0: tensor<576xi8>) -> (%output0: tensor<576xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576xi8>
%1 = tensor.empty() : tensor<576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<576xi8>) outs(%1 : tensor<576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_30(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_30(%input0: tensor<320xi8>) -> (%output0: tensor<320xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<320xi8>
%1 = tensor.empty() : tensor<320xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<320xi8>) outs(%1 : tensor<320xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<320xf32>
%3 = hal.tensor.export %2 "output0" : tensor<320xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_32(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_32(%input0: tensor<24xi8>) -> (%output0: tensor<24xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24xi8>
%1 = tensor.empty() : tensor<24xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<24xi8>) outs(%1 : tensor<24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_28(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_28(%input0: tensor<960x3x3xi8>) -> (%output0: tensor<960x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x3x3xi8>
%1 = tensor.empty() : tensor<960x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<960x3x3xi8>) outs(%1 : tensor<960x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<960x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<960x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_35(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_35(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 2.500000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_33(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_33(%input0: tensor<144x3x3xi8>) -> (%output0: tensor<144x3x3xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x3x3xi8>
%1 = tensor.empty() : tensor<144x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<144x3x3xi8>) outs(%1 : tensor<144x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_34(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_34(%input0: tensor<144xi8>) -> (%output0: tensor<144xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144xi8>
%1 = tensor.empty() : tensor<144xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<144xi8>) outs(%1 : tensor<144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_37(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_37(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_31(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_31(%input0: tensor<144x3x3xi8>) -> (%output0: tensor<144x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x3x3xi8>
%1 = tensor.empty() : tensor<144x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<144x3x3xi8>) outs(%1 : tensor<144x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<144x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<144x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_42(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_42(%input0: tensor<192xi8>) -> (%output0: tensor<192xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192xi8>
%1 = tensor.empty() : tensor<192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<192xi8>) outs(%1 : tensor<192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_40(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_40(%input0: tensor<32xi8>) -> (%output0: tensor<32xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xi8>
%1 = tensor.empty() : tensor<32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xi8>) outs(%1 : tensor<32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_36(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_36(%input0: tensor<192x3x3xi8>) -> (%output0: tensor<192x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x3x3xi8>
%1 = tensor.empty() : tensor<192x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<192x3x3xi8>) outs(%1 : tensor<192x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_45(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_45(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_47(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_47(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_48(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_48(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_43(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_43(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 2.500000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_41(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_41(%input0: tensor<192x3x3xi8>) -> (%output0: tensor<192x3x3xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x3x3xi8>
%1 = tensor.empty() : tensor<192x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<192x3x3xi8>) outs(%1 : tensor<192x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_46(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_46(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_49(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_49(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_39(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_39(%input0: tensor<192xi8>) -> (%output0: tensor<192xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192xi8>
%1 = tensor.empty() : tensor<192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<192xi8>) outs(%1 : tensor<192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_44(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_44(%input0: tensor<384x3x3xi8>) -> (%output0: tensor<384x3x3xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x3x3xi8>
%1 = tensor.empty() : tensor<384x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<384x3x3xi8>) outs(%1 : tensor<384x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_38(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_38(%input0: tensor<192x3x3xi8>) -> (%output0: tensor<192x3x3xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x3x3xi8>
%1 = tensor.empty() : tensor<192x3x3xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<192x3x3xi8>) outs(%1 : tensor<192x3x3xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<192x3x3xf32>
%3 = hal.tensor.export %2 "output0" : tensor<192x3x3xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_50(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_50(%input0: tensor<384xi8>) -> (%output0: tensor<384xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384xi8>
%1 = tensor.empty() : tensor<384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<384xi8>) outs(%1 : tensor<384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_51(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_51(%input0: tensor<64xi8>) -> (%output0: tensor<64xf32>)"}} {
%cst = arith.constant 1.250000e-01 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64xi8>
%1 = tensor.empty() : tensor<64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<64xi8>) outs(%1 : tensor<64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_55(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_55(%input0: tensor<16x1x1x32xi8>) -> (%output0: tensor<1x1x32x16xf32>)"}} {
%cst = arith.constant 6.250000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x16xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<16x1x1x32xi8>) outs(%1 : tensor<1x1x32x16xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x16xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x16xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_52(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_52(%input0: tensor<256xi8>) -> (%output0: tensor<256xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256xi8>
%1 = tensor.empty() : tensor<256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<256xi8>) outs(%1 : tensor<256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_54(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_54(%input0: tensor<21xi8>) -> (%output0: tensor<21xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<21xi8>
%1 = tensor.empty() : tensor<21xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<21xi8>) outs(%1 : tensor<21xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<21xf32>
%3 = hal.tensor.export %2 "output0" : tensor<21xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_56(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_56(%input0: tensor<96x1x1x16xi8>) -> (%output0: tensor<1x1x16x96xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x16xi8>
%1 = tensor.empty() : tensor<1x1x16x96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x16xi8>) outs(%1 : tensor<1x1x16x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x16x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x16x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_61(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_61(%input0: tensor<32x1x1x144xi8>) -> (%output0: tensor<1x1x144x32xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x1x1x144xi8>
%1 = tensor.empty() : tensor<1x1x144x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x1x1x144xi8>) outs(%1 : tensor<1x1x144x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x144x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x144x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_53(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_53(%input0: tensor<256xi8>) -> (%output0: tensor<256xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256xi8>
%1 = tensor.empty() : tensor<256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<256xi8>) outs(%1 : tensor<256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_64(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_64(%input0: tensor<192x1x1x32xi8>) -> (%output0: tensor<1x1x32x192xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<192x1x1x32xi8>) outs(%1 : tensor<1x1x32x192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_65(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_65(%input0: tensor<32x1x1x192xi8>) -> (%output0: tensor<1x1x192x32xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x1x1x192xi8>
%1 = tensor.empty() : tensor<1x1x192x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x1x1x192xi8>) outs(%1 : tensor<1x1x192x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x192x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x192x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_66(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_66(%input0: tensor<192x1x1x32xi8>) -> (%output0: tensor<1x1x32x192xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<192x1x1x32xi8>) outs(%1 : tensor<1x1x32x192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_58(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_58(%input0: tensor<144x1x1x24xi8>) -> (%output0: tensor<1x1x24x144xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x1x1x24xi8>
%1 = tensor.empty() : tensor<1x1x24x144xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<144x1x1x24xi8>) outs(%1 : tensor<1x1x24x144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x24x144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x24x144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_67(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_67(%input0: tensor<64x1x1x192xi8>) -> (%output0: tensor<1x1x192x64xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x192xi8>
%1 = tensor.empty() : tensor<1x1x192x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x192xi8>) outs(%1 : tensor<1x1x192x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x192x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x192x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_62(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_62(%input0: tensor<192x1x1x32xi8>) -> (%output0: tensor<1x1x32x192xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<192x1x1x32xi8>
%1 = tensor.empty() : tensor<1x1x32x192xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<192x1x1x32xi8>) outs(%1 : tensor<1x1x32x192xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x32x192xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x32x192xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_63(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_63(%input0: tensor<32x1x1x192xi8>) -> (%output0: tensor<1x1x192x32xf32>)"}} {
%cst = arith.constant 1.562500e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x1x1x192xi8>
%1 = tensor.empty() : tensor<1x1x192x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<32x1x1x192xi8>) outs(%1 : tensor<1x1x192x32xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x192x32xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x192x32xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_57(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_57(%input0: tensor<24x1x1x96xi8>) -> (%output0: tensor<1x1x96x24xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x24xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<24x1x1x96xi8>) outs(%1 : tensor<1x1x96x24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_60(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_60(%input0: tensor<144x1x1x24xi8>) -> (%output0: tensor<1x1x24x144xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<144x1x1x24xi8>
%1 = tensor.empty() : tensor<1x1x24x144xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<144x1x1x24xi8>) outs(%1 : tensor<1x1x24x144xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x24x144xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x24x144xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_59(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_59(%input0: tensor<24x1x1x144xi8>) -> (%output0: tensor<1x1x144x24xf32>)"}} {
%cst = arith.constant 3.125000e-02 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<24x1x1x144xi8>
%1 = tensor.empty() : tensor<1x1x144x24xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<24x1x1x144xi8>) outs(%1 : tensor<1x1x144x24xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x144x24xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x144x24xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_71(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_71(%input0: tensor<64x1x1x384xi8>) -> (%output0: tensor<1x1x384x64xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x384xi8>) outs(%1 : tensor<1x1x384x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_68(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_68(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_70(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_70(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_72(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_72(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_73(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_73(%input0: tensor<64x1x1x384xi8>) -> (%output0: tensor<1x1x384x64xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x384xi8>) outs(%1 : tensor<1x1x384x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_69(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_69(%input0: tensor<64x1x1x384xi8>) -> (%output0: tensor<1x1x384x64xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<64x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x64xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<64x1x1x384xi8>) outs(%1 : tensor<1x1x384x64xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x64xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x64xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_76(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_76(%input0: tensor<576x1x1x96xi8>) -> (%output0: tensor<1x1x96x576xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<576x1x1x96xi8>) outs(%1 : tensor<1x1x96x576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_79(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_79(%input0: tensor<96x1x1x576xi8>) -> (%output0: tensor<1x1x576x96xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x576xi8>
%1 = tensor.empty() : tensor<1x1x576x96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x576xi8>) outs(%1 : tensor<1x1x576x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x576x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x576x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_82(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_82(%input0: tensor<960x1x1x160xi8>) -> (%output0: tensor<1x1x160x960xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x1x1x160xi8>
%1 = tensor.empty() : tensor<1x1x160x960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<960x1x1x160xi8>) outs(%1 : tensor<1x1x160x960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x160x960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x160x960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_74(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_74(%input0: tensor<384x1x1x64xi8>) -> (%output0: tensor<1x1x64x384xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<384x1x1x64xi8>
%1 = tensor.empty() : tensor<1x1x64x384xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<384x1x1x64xi8>) outs(%1 : tensor<1x1x64x384xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x64x384xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x64x384xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_75(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_75(%input0: tensor<96x1x1x384xi8>) -> (%output0: tensor<1x1x384x96xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x384xi8>
%1 = tensor.empty() : tensor<1x1x384x96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x384xi8>) outs(%1 : tensor<1x1x384x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x384x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x384x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_85(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_85(%input0: tensor<160x1x1x960xi8>) -> (%output0: tensor<1x1x960x160xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160x1x1x960xi8>
%1 = tensor.empty() : tensor<1x1x960x160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<160x1x1x960xi8>) outs(%1 : tensor<1x1x960x160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x960x160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x960x160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_78(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_78(%input0: tensor<576x1x1x96xi8>) -> (%output0: tensor<1x1x96x576xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<576x1x1x96xi8>) outs(%1 : tensor<1x1x96x576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_77(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_77(%input0: tensor<96x1x1x576xi8>) -> (%output0: tensor<1x1x576x96xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<96x1x1x576xi8>
%1 = tensor.empty() : tensor<1x1x576x96xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<96x1x1x576xi8>) outs(%1 : tensor<1x1x576x96xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x576x96xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x576x96xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_87(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_87(%input0: tensor<320x1x1x960xi8>) -> (%output0: tensor<1x1x960x320xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<320x1x1x960xi8>
%1 = tensor.empty() : tensor<1x1x960x320xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<320x1x1x960xi8>) outs(%1 : tensor<1x1x960x320xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x960x320xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x960x320xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_88(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_88(%input0: tensor<256x1x1x320xi8>) -> (%output0: tensor<1x1x320x256xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256x1x1x320xi8>
%1 = tensor.empty() : tensor<1x1x320x256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<256x1x1x320xi8>) outs(%1 : tensor<1x1x320x256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x320x256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x320x256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_83(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_83(%input0: tensor<160x1x1x960xi8>) -> (%output0: tensor<1x1x960x160xf32>)"}} {
%cst = arith.constant 7.812500e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160x1x1x960xi8>
%1 = tensor.empty() : tensor<1x1x960x160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<160x1x1x960xi8>) outs(%1 : tensor<1x1x960x160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x960x160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x960x160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_84(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_84(%input0: tensor<960x1x1x160xi8>) -> (%output0: tensor<1x1x160x960xf32>)"}} {
%cst = arith.constant 9.765625E-4 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x1x1x160xi8>
%1 = tensor.empty() : tensor<1x1x160x960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<960x1x1x160xi8>) outs(%1 : tensor<1x1x160x960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x160x960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x160x960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_81(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_81(%input0: tensor<160x1x1x576xi8>) -> (%output0: tensor<1x1x576x160xf32>)"}} {
%cst = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<160x1x1x576xi8>
%1 = tensor.empty() : tensor<1x1x576x160xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<160x1x1x576xi8>) outs(%1 : tensor<1x1x576x160xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x576x160xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x576x160xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_80(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_80(%input0: tensor<576x1x1x96xi8>) -> (%output0: tensor<1x1x96x576xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<576x1x1x96xi8>
%1 = tensor.empty() : tensor<1x1x96x576xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<576x1x1x96xi8>) outs(%1 : tensor<1x1x96x576xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x96x576xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x96x576xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_86(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_86(%input0: tensor<960x1x1x160xi8>) -> (%output0: tensor<1x1x160x960xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<960x1x1x160xi8>
%1 = tensor.empty() : tensor<1x1x160x960xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<960x1x1x160xi8>) outs(%1 : tensor<1x1x160x960xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x160x960xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x160x960xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_89(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_89(%input0: tensor<256x1x1x320xi8>) -> (%output0: tensor<1x1x320x256xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256x1x1x320xi8>
%1 = tensor.empty() : tensor<1x1x320x256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<256x1x1x320xi8>) outs(%1 : tensor<1x1x320x256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x320x256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x320x256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_90(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_90(%input0: tensor<256x1x1x512xi8>) -> (%output0: tensor<1x1x512x256xf32>)"}} {
%cst = arith.constant 0.001953125 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<256x1x1x512xi8>
%1 = tensor.empty() : tensor<1x1x512x256xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<256x1x1x512xi8>) outs(%1 : tensor<1x1x512x256xf32>) {
^bb0(%in: i8, %out: f32):
%4 = arith.extsi %in : i8 to i32
%5 = arith.sitofp %4 : i32 to f32
%6 = arith.mulf %5, %cst : f32
linalg.yield %6 : f32
} -> tensor<1x1x512x256xf32>
%3 = hal.tensor.export %2 "output0" : tensor<1x1x512x256xf32> -> !hal.buffer_view
util.return %3 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @jit_eval_91(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_91(%input0: tensor<21x256xi8>) -> (%output0: tensor<256x24xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 3.906250e-03 : f32
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<21x256xi8>
%1 = tensor.empty() : tensor<256x21xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<21x256xi8>) outs(%1 : tensor<256x21xf32>) {
^bb0(%in: i8, %out: f32):
%6 = arith.extsi %in : i8 to i32
%7 = arith.sitofp %6 : i32 to f32
%8 = arith.mulf %7, %cst_0 : f32
linalg.yield %8 : f32
} -> tensor<256x21xf32>
%3 = tensor.empty() : tensor<256x24xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<256x24xf32>) -> tensor<256x24xf32>
%inserted_slice = tensor.insert_slice %2 into %4[0, 0] [256, 21] [1, 1] : tensor<256x21xf32> into tensor<256x24xf32>
%5 = hal.tensor.export %inserted_slice "output0" : tensor<256x24xf32> -> !hal.buffer_view
util.return %5 : !hal.buffer_view
}
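// @jit_eval_91 shows padding applied to a constant, presumably by PadToIntrinsics: the 21x256 i8
// weights (21 being the segmentation class count) are transposed and dequantized to 256x21 f32,
// then zero-padded along the class dimension to 256x24 with linalg.fill + tensor.insert_slice. The
// same padded 256x24 shape appears below as %cst_3 (tensor<1x1x256x24xf32>) in @tf2onnx$async.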
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @tf2onnx(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @tf2onnx$async(%arg0, %0, %fence) : (!hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
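// @tf2onnx is the synchronous ABI wrapper for the coarse-fences entry point: it creates a signal
// fence on device 0, calls @tf2onnx$async with a null wait fence, and blocks on that signal fence
// (timeout_millis = -1) before returning the exported buffer view.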
// -----// IR Dump After FormDispatchRegionsPass (iree-flow-form-dispatch-regions) //----- //
util.func public @tf2onnx$async(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant dense_resource<__elided__> : tensor<1x21xf32>
%cst_0 = arith.constant dense_resource<__elided__> : tensor<1x256x1x1xf32>
%cst_1 = arith.constant 0.999259948 : f32
%cst_2 = arith.constant 6.000000e+00 : f32
%cst_3 = arith.constant dense_resource<__elided__> : tensor<1x1x256x24xf32>
%cst_4 = arith.constant 6.400000e+01 : f32
%cst_5 = arith.constant -9.53674316E-7 : f32
%cst_6 = arith.constant 7.89230776 : f32
%cst_7 = arith.constant 6.500000e+01 : f32
%cst_8 = arith.constant 7.812500e-03 : f32
%cst_9 = arith.constant 1.250000e-01 : f32
%cst_10 = arith.constant 2.500000e-01 : f32
%cst_11 = arith.constant 6.250000e-02 : f32
%cst_12 = arith.constant 3.125000e-02 : f32
%cst_13 = arith.constant 5.000000e-01 : f32
%cst_14 = arith.constant 1.000000e+00 : f32
%cst_15 = arith.constant 0.000000e+00 : f32
%cst_16 = arith.constant 1.270000e+02 : f32
%cst_17 = arith.constant -1.280000e+02 : f32
%cst_18 = arith.constant 4.225000e+03 : f32
%c0 = arith.constant 0 : index
%cst_19 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_20 = arith.constant dense_resource<__elided__> : tensor<576xf32>
%cst_21 = arith.constant dense_resource<__elided__> : tensor<960xf32>
%cst_22 = arith.constant dense_resource<__elided__> : tensor<144xf32>
%cst_23 = arith.constant dense_resource<__elided__> : tensor<192xf32>
%cst_24 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_25 = arith.constant dense_resource<__elided__> : tensor<256xf32>
%cst_26 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_27 = arith.constant dense_resource<__elided__> : tensor<3x3x3x32xf32>
%cst_28 = arith.constant dense_resource<__elided__> : tensor<32x3x3xf32>
%cst_29 = arith.constant dense_resource<__elided__> : tensor<16xf32>
%cst_30 = arith.constant dense_resource<__elided__> : tensor<96x3x3xf32>
%cst_31 = arith.constant dense_resource<__elided__> : tensor<24xf32>
%cst_32 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_33 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_34 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_35 = arith.constant dense_resource<__elided__> : tensor<576x3x3xf32>
%cst_36 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_37 = arith.constant dense_resource<__elided__> : tensor<576x3x3xf32>
%cst_38 = arith.constant dense_resource<__elided__> : tensor<576xf32>
%cst_39 = arith.constant dense_resource<__elided__> : tensor<96xf32>
%cst_40 = arith.constant dense_resource<__elided__> : tensor<576x3x3xf32>
%cst_41 = arith.constant dense_resource<__elided__> : tensor<576xf32>
%cst_42 = arith.constant dense_resource<__elided__> : tensor<160xf32>
%cst_43 = arith.constant dense_resource<__elided__> : tensor<960x3x3xf32>
%cst_44 = arith.constant dense_resource<__elided__> : tensor<160xf32>
%cst_45 = arith.constant dense_resource<__elided__> : tensor<960x3x3xf32>
%cst_46 = arith.constant dense_resource<__elided__> : tensor<960xf32>
%cst_47 = arith.constant dense_resource<__elided__> : tensor<160xf32>
%cst_48 = arith.constant dense_resource<__elided__> : tensor<960x3x3xf32>
%cst_49 = arith.constant dense_resource<__elided__> : tensor<960xf32>
%cst_50 = arith.constant dense_resource<__elided__> : tensor<320xf32>
%cst_51 = arith.constant dense_resource<__elided__> : tensor<144x3x3xf32>
%cst_52 = arith.constant dense_resource<__elided__> : tensor<24xf32>
%cst_53 = arith.constant dense_resource<__elided__> : tensor<144x3x3xf32>
%cst_54 = arith.constant dense_resource<__elided__> : tensor<144xf32>
%cst_55 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_56 = arith.constant dense_resource<__elided__> : tensor<192x3x3xf32>
%cst_57 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_58 = arith.constant dense_resource<__elided__> : tensor<192x3x3xf32>
%cst_59 = arith.constant dense_resource<__elided__> : tensor<192xf32>
%cst_60 = arith.constant dense_resource<__elided__> : tensor<32xf32>
%cst_61 = arith.constant dense_resource<__elided__> : tensor<192x3x3xf32>
%cst_62 = arith.constant dense_resource<__elided__> : tensor<192xf32>
%cst_63 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_64 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_65 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_66 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_67 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_68 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_69 = arith.constant dense_resource<__elided__> : tensor<384x3x3xf32>
%cst_70 = arith.constant dense_resource<__elided__> : tensor<384xf32>
%cst_71 = arith.constant dense_resource<__elided__> : tensor<64xf32>
%cst_72 = arith.constant dense_resource<__elided__> : tensor<256xf32>
%cst_73 = arith.constant dense_resource<__elided__> : tensor<1x1x32x16xf32>
%cst_74 = arith.constant dense_resource<__elided__> : tensor<1x1x16x96xf32>
%cst_75 = arith.constant dense_resource<__elided__> : tensor<1x1x96x24xf32>
%cst_76 = arith.constant dense_resource<__elided__> : tensor<1x1x24x144xf32>
%cst_77 = arith.constant dense_resource<__elided__> : tensor<1x1x144x24xf32>
%cst_78 = arith.constant dense_resource<__elided__> : tensor<1x1x24x144xf32>
%cst_79 = arith.constant dense_resource<__elided__> : tensor<1x1x144x32xf32>
%cst_80 = arith.constant dense_resource<__elided__> : tensor<1x1x32x192xf32>
%cst_81 = arith.constant dense_resource<__elided__> : tensor<1x1x192x32xf32>
%cst_82 = arith.constant dense_resource<__elided__> : tensor<1x1x32x192xf32>
%cst_83 = arith.constant dense_resource<__elided__> : tensor<1x1x192x32xf32>
%cst_84 = arith.constant dense_resource<__elided__> : tensor<1x1x32x192xf32>
%cst_85 = arith.constant dense_resource<__elided__> : tensor<1x1x192x64xf32>
%cst_86 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_87 = arith.constant dense_resource<__elided__> : tensor<1x1x384x64xf32>
%cst_88 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_89 = arith.constant dense_resource<__elided__> : tensor<1x1x384x64xf32>
%cst_90 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_91 = arith.constant dense_resource<__elided__> : tensor<1x1x384x64xf32>
%cst_92 = arith.constant dense_resource<__elided__> : tensor<1x1x64x384xf32>
%cst_93 = arith.constant dense_resource<__elided__> : tensor<1x1x384x96xf32>
%cst_94 = arith.constant dense_resource<__elided__> : tensor<1x1x96x576xf32>
%cst_95 = arith.constant dense_resource<__elided__> : tensor<1x1x576x96xf32>
%cst_96 = arith.constant dense_resource<__elided__> : tensor<1x1x96x576xf32>
%cst_97 = arith.constant dense_resource<__elided__> : tensor<1x1x576x96xf32>
%cst_98 = arith.constant dense_resource<__elided__> : tensor<1x1x96x576xf32>
%cst_99 = arith.constant dense_resource<__elided__> : tensor<1x1x576x160xf32>
%cst_100 = arith.constant dense_resource<__elided__> : tensor<1x1x160x960xf32>
%cst_101 = arith.constant dense_resource<__elided__> : tensor<1x1x960x160xf32>
%cst_102 = arith.constant dense_resource<__elided__> : tensor<1x1x160x960xf32>
%cst_103 = arith.constant dense_resource<__elided__> : tensor<1x1x960x160xf32>
%cst_104 = arith.constant dense_resource<__elided__> : tensor<1x1x160x960xf32>
%cst_105 = arith.constant dense_resource<__elided__> : tensor<1x1x960x320xf32>
%cst_106 = arith.constant dense_resource<__elided__> : tensor<1x1x320x256xf32>
%cst_107 = arith.constant dense_resource<__elided__> : tensor<1x1x320x256xf32>
%cst_108 = arith.constant dense_resource<__elided__> : tensor<1x1x512x256xf32>
%0 = hal.tensor.import wait(%arg1) => %arg0 : !hal.buffer_view -> tensor<1x513x513x3xf32>
%expanded = tensor.expand_shape %0 [[0], [1], [2], [3, 4]] output_shape [1, 513, 513, 1, 3] : tensor<1x513x513x3xf32> into tensor<1x513x513x1x3xf32>
%1 = tensor.empty() : tensor<1x513x513x1x3xf32>
%2 = flow.dispatch.region -> (tensor<1x513x513x1x3xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<1x513x513x1x3xf32>) outs(%1 : tensor<1x513x513x1x3xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.divf %in, %cst_8 : f32
%298 = math.round %297 : f32
%299 = arith.addf %298, %cst_15 : f32
%300 = arith.cmpf ult, %299, %cst_17 : f32
%301 = arith.cmpf ugt, %299, %cst_16 : f32
%302 = arith.select %300, %cst_17, %299 : f32
%303 = arith.select %301, %cst_16, %302 : f32
%304 = arith.fptosi %303 : f32 to i8
%305 = arith.extsi %304 : i8 to i32
%306 = arith.sitofp %305 : i32 to f32
%307 = arith.mulf %306, %cst_8 : f32
linalg.yield %307 : f32
} -> tensor<1x513x513x1x3xf32>
flow.return %296 : tensor<1x513x513x1x3xf32>
}
%collapsed = tensor.collapse_shape %2 [[0, 1], [2, 3], [4]] : tensor<1x513x513x1x3xf32> into tensor<513x513x3xf32>
%3 = tensor.empty() : tensor<515x515x3xf32>
%4 = linalg.fill ins(%cst_15 : f32) outs(%3 : tensor<515x515x3xf32>) -> tensor<515x515x3xf32>
%inserted_slice = tensor.insert_slice %collapsed into %4[1, 1, 0] [513, 513, 3] [1, 1, 1] : tensor<513x513x3xf32> into tensor<515x515x3xf32>
%expanded_109 = tensor.expand_shape %inserted_slice [[0, 1], [2], [3]] output_shape [1, 515, 515, 3] : tensor<515x515x3xf32> into tensor<1x515x515x3xf32>
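// The dispatch region above (%2) fake-quantizes the imported 1x513x513x3 input: divide by the
// 7.8125e-3 scale (%cst_8), round, clamp to [-128, 127], cast to i8 and straight back to f32 at the
// same scale. The collapsed result is then zero-padded from 513x513 to 515x515 at offset [1, 1, 0],
// which supplies the explicit border for the 3x3 stride-2 convolution that follows.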
%5 = tensor.empty() : tensor<1x257x257x32xf32>
%6 = linalg.fill ins(%cst_15 : f32) outs(%5 : tensor<1x257x257x32xf32>) -> tensor<1x257x257x32xf32>
%7 = flow.dispatch.region -> (tensor<1x257x257x32xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%expanded_109, %cst_27 : tensor<1x515x515x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x257x257x32xf32>) -> tensor<1x257x257x32xf32>
%297 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%296, %cst_26 : tensor<1x257x257x32xf32>, tensor<32xf32>) outs(%5 : tensor<1x257x257x32xf32>) {
^bb0(%in: f32, %in_256: f32, %out: f32):
%298 = arith.addf %in, %in_256 : f32
%299 = arith.cmpf ult, %298, %cst_15 : f32
%300 = arith.select %299, %cst_15, %298 : f32
%301 = arith.cmpf ugt, %300, %cst_2 : f32
%302 = arith.select %301, %cst_2, %300 : f32
%303 = arith.divf %302, %cst_11 : f32
%304 = math.round %303 : f32
%305 = arith.addf %304, %cst_15 : f32
%306 = arith.cmpf ult, %305, %cst_17 : f32
%307 = arith.cmpf ugt, %305, %cst_16 : f32
%308 = arith.select %306, %cst_17, %305 : f32
%309 = arith.select %307, %cst_16, %308 : f32
%310 = arith.fptosi %309 : f32 to i8
%311 = arith.extsi %310 : i8 to i32
%312 = arith.sitofp %311 : i32 to f32
%313 = arith.mulf %312, %cst_11 : f32
linalg.yield %313 : f32
} -> tensor<1x257x257x32xf32>
flow.return %297 : tensor<1x257x257x32xf32>
}
%collapsed_110 = tensor.collapse_shape %7 [[0, 1], [2], [3]] : tensor<1x257x257x32xf32> into tensor<257x257x32xf32>
%8 = tensor.empty() : tensor<259x259x32xf32>
%9 = linalg.fill ins(%cst_15 : f32) outs(%8 : tensor<259x259x32xf32>) -> tensor<259x259x32xf32>
%inserted_slice_111 = tensor.insert_slice %collapsed_110 into %9[1, 1, 0] [257, 257, 32] [1, 1, 1] : tensor<257x257x32xf32> into tensor<259x259x32xf32>
%expanded_112 = tensor.expand_shape %inserted_slice_111 [[0], [1], [2, 3]] output_shape [259, 259, 1, 32] : tensor<259x259x32xf32> into tensor<259x259x1x32xf32>
%10 = tensor.empty() : tensor<1x32x259x259xf32>
%11 = flow.dispatch.region -> (tensor<1x32x259x259xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_112 : tensor<259x259x1x32xf32>) outs(%10 : tensor<1x32x259x259xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x32x259x259xf32>
flow.return %296 : tensor<1x32x259x259xf32>
}
%12 = tensor.empty() : tensor<1x32x257x257xf32>
%13 = linalg.fill ins(%cst_15 : f32) outs(%12 : tensor<1x32x257x257xf32>) -> tensor<1x32x257x257xf32>
%14 = flow.dispatch.region -> (tensor<1x32x257x257xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%11, %cst_28 : tensor<1x32x259x259xf32>, tensor<32x3x3xf32>) outs(%13 : tensor<1x32x257x257xf32>) -> tensor<1x32x257x257xf32>
flow.return %296 : tensor<1x32x257x257xf32>
}
%15 = tensor.empty() : tensor<257x257x1x32xf32>
%16 = flow.dispatch.region -> (tensor<257x257x1x32xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14 : tensor<1x32x257x257xf32>) outs(%15 : tensor<257x257x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<257x257x1x32xf32>
flow.return %296 : tensor<257x257x1x32xf32>
}
%collapsed_113 = tensor.collapse_shape %16 [[0], [1], [2, 3]] : tensor<257x257x1x32xf32> into tensor<257x257x32xf32>
%17 = tensor.empty() : tensor<257x257x16xf32>
%18 = tensor.empty() : tensor<260x260x32xf32>
%19 = linalg.fill ins(%cst_15 : f32) outs(%18 : tensor<260x260x32xf32>) -> tensor<260x260x32xf32>
%inserted_slice_114 = tensor.insert_slice %collapsed_113 into %19[0, 0, 0] [257, 257, 32] [1, 1, 1] : tensor<257x257x32xf32> into tensor<260x260x32xf32>
%expanded_115 = tensor.expand_shape %inserted_slice_114 [[0, 1], [2], [3]] output_shape [1, 260, 260, 32] : tensor<260x260x32xf32> into tensor<1x260x260x32xf32>
%20 = tensor.empty() : tensor<1x260x260x16xf32>
%21 = linalg.fill ins(%cst_15 : f32) outs(%20 : tensor<1x260x260x16xf32>) -> tensor<1x260x260x16xf32>
%22 = flow.dispatch.region -> (tensor<1x260x260x16xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_115, %cst_73 : tensor<1x260x260x32xf32>, tensor<1x1x32x16xf32>) outs(%21 : tensor<1x260x260x16xf32>) -> tensor<1x260x260x16xf32>
flow.return %296 : tensor<1x260x260x16xf32>
}
%extracted_slice = tensor.extract_slice %22[0, 0, 0, 0] [1, 257, 257, 16] [1, 1, 1, 1] : tensor<1x260x260x16xf32> to tensor<257x257x16xf32>
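// Pad/slice pair around the 1x1 convolution, presumably inserted by PadToIntrinsics: the 257x257x32
// activation is zero-padded to 260x260 (%inserted_slice_114) before the conv, and the 1x260x260x16
// result is sliced back down to 257x257x16 here, so the convolution itself runs on the padded
// spatial shape.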
%23 = tensor.empty() : tensor<257x257x96xf32>
%24 = tensor.empty() : tensor<260x260x16xf32>
%25 = linalg.fill ins(%cst_15 : f32) outs(%24 : tensor<260x260x16xf32>) -> tensor<260x260x16xf32>
%26 = flow.dispatch.region -> (tensor<260x260x16xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice, %cst_29 : tensor<257x257x16xf32>, tensor<16xf32>) outs(%17 : tensor<257x257x16xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.divf %297, %cst_10 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_10 : f32
linalg.yield %308 : f32
} -> tensor<257x257x16xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %25[0, 0, 0] [257, 257, 16] [1, 1, 1] : tensor<257x257x16xf32> into tensor<260x260x16xf32>
flow.return %inserted_slice_256 : tensor<260x260x16xf32>
}
%expanded_116 = tensor.expand_shape %26 [[0, 1], [2], [3]] output_shape [1, 260, 260, 16] : tensor<260x260x16xf32> into tensor<1x260x260x16xf32>
%27 = tensor.empty() : tensor<1x260x260x96xf32>
%28 = linalg.fill ins(%cst_15 : f32) outs(%27 : tensor<1x260x260x96xf32>) -> tensor<1x260x260x96xf32>
%29 = flow.dispatch.region -> (tensor<1x260x260x96xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_116, %cst_74 : tensor<1x260x260x16xf32>, tensor<1x1x16x96xf32>) outs(%28 : tensor<1x260x260x96xf32>) -> tensor<1x260x260x96xf32>
flow.return %296 : tensor<1x260x260x96xf32>
}
%extracted_slice_117 = tensor.extract_slice %29[0, 0, 0, 0] [1, 257, 257, 96] [1, 1, 1, 1] : tensor<1x260x260x96xf32> to tensor<257x257x96xf32>
%30 = tensor.empty() : tensor<259x259x96xf32>
%31 = linalg.fill ins(%cst_15 : f32) outs(%30 : tensor<259x259x96xf32>) -> tensor<259x259x96xf32>
%32 = flow.dispatch.region -> (tensor<259x259x96xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_117, %cst_19 : tensor<257x257x96xf32>, tensor<96xf32>) outs(%23 : tensor<257x257x96xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<257x257x96xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %31[1, 1, 0] [257, 257, 96] [1, 1, 1] : tensor<257x257x96xf32> into tensor<259x259x96xf32>
flow.return %inserted_slice_256 : tensor<259x259x96xf32>
}
%expanded_118 = tensor.expand_shape %32 [[0], [1], [2, 3]] output_shape [259, 259, 1, 96] : tensor<259x259x96xf32> into tensor<259x259x1x96xf32>
%33 = tensor.empty() : tensor<1x96x259x259xf32>
%34 = flow.dispatch.region -> (tensor<1x96x259x259xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_118 : tensor<259x259x1x96xf32>) outs(%33 : tensor<1x96x259x259xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x96x259x259xf32>
flow.return %296 : tensor<1x96x259x259xf32>
}
%35 = tensor.empty() : tensor<1x96x129x129xf32>
%36 = linalg.fill ins(%cst_15 : f32) outs(%35 : tensor<1x96x129x129xf32>) -> tensor<1x96x129x129xf32>
%37 = flow.dispatch.region -> (tensor<1x96x129x129xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%34, %cst_30 : tensor<1x96x259x259xf32>, tensor<96x3x3xf32>) outs(%36 : tensor<1x96x129x129xf32>) -> tensor<1x96x129x129xf32>
flow.return %296 : tensor<1x96x129x129xf32>
}
%38 = tensor.empty() : tensor<129x129x1x96xf32>
%39 = flow.dispatch.region -> (tensor<129x129x1x96xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37 : tensor<1x96x129x129xf32>) outs(%38 : tensor<129x129x1x96xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<129x129x1x96xf32>
flow.return %296 : tensor<129x129x1x96xf32>
}
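// Projection: the 129x129x96 activation is padded to 132x132 and reduced to 24
// channels with a 1x1x96x24 conv (%45).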
%collapsed_119 = tensor.collapse_shape %39 [[0], [1], [2, 3]] : tensor<129x129x1x96xf32> into tensor<129x129x96xf32>
%40 = tensor.empty() : tensor<129x129x24xf32>
%41 = tensor.empty() : tensor<132x132x96xf32>
%42 = linalg.fill ins(%cst_15 : f32) outs(%41 : tensor<132x132x96xf32>) -> tensor<132x132x96xf32>
%inserted_slice_120 = tensor.insert_slice %collapsed_119 into %42[0, 0, 0] [129, 129, 96] [1, 1, 1] : tensor<129x129x96xf32> into tensor<132x132x96xf32>
%expanded_121 = tensor.expand_shape %inserted_slice_120 [[0, 1], [2], [3]] output_shape [1, 132, 132, 96] : tensor<132x132x96xf32> into tensor<1x132x132x96xf32>
%43 = tensor.empty() : tensor<1x132x132x24xf32>
%44 = linalg.fill ins(%cst_15 : f32) outs(%43 : tensor<1x132x132x24xf32>) -> tensor<1x132x132x24xf32>
%45 = flow.dispatch.region -> (tensor<1x132x132x24xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_121, %cst_75 : tensor<1x132x132x96xf32>, tensor<1x1x96x24xf32>) outs(%44 : tensor<1x132x132x24xf32>) -> tensor<1x132x132x24xf32>
flow.return %296 : tensor<1x132x132x24xf32>
}
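// %47 adds the projection bias (%cst_31) and quantizes the result to i8 at scale
// %cst_10; %48 immediately dequantizes it back to f32. The f32 copy %48 is also
// the skip-connection input for the residual add in %68 further down.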
%extracted_slice_122 = tensor.extract_slice %45[0, 0, 0, 0] [1, 129, 129, 24] [1, 1, 1, 1] : tensor<1x132x132x24xf32> to tensor<129x129x24xf32>
%46 = tensor.empty() : tensor<129x129x24xi8>
%47 = flow.dispatch.region -> (tensor<129x129x24xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_122, %cst_31 : tensor<129x129x24xf32>, tensor<24xf32>) outs(%46 : tensor<129x129x24xi8>) {
^bb0(%in: f32, %in_256: f32, %out: i8):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.divf %297, %cst_10 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
linalg.yield %305 : i8
} -> tensor<129x129x24xi8>
flow.return %296 : tensor<129x129x24xi8>
}
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%47 : tensor<129x129x24xi8>) outs(%40 : tensor<129x129x24xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_10 : f32
linalg.yield %298 : f32
} -> tensor<129x129x24xf32>
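// Next expand / depthwise / project block at 129x129: expand 24 -> 144 (%54),
// bias/clamp/requantize and pad to 131x131 (%57), depthwise 3x3 with stride 1 in
// NCHW (%62), transpose back with clamp/requantize (%64), project 144 -> 24 (%67).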
%49 = tensor.empty() : tensor<129x129x144xf32>
%50 = tensor.empty() : tensor<132x132x24xf32>
%51 = linalg.fill ins(%cst_15 : f32) outs(%50 : tensor<132x132x24xf32>) -> tensor<132x132x24xf32>
%inserted_slice_123 = tensor.insert_slice %48 into %51[0, 0, 0] [129, 129, 24] [1, 1, 1] : tensor<129x129x24xf32> into tensor<132x132x24xf32>
%expanded_124 = tensor.expand_shape %inserted_slice_123 [[0, 1], [2], [3]] output_shape [1, 132, 132, 24] : tensor<132x132x24xf32> into tensor<1x132x132x24xf32>
%52 = tensor.empty() : tensor<1x132x132x144xf32>
%53 = linalg.fill ins(%cst_15 : f32) outs(%52 : tensor<1x132x132x144xf32>) -> tensor<1x132x132x144xf32>
%54 = flow.dispatch.region -> (tensor<1x132x132x144xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_124, %cst_76 : tensor<1x132x132x24xf32>, tensor<1x1x24x144xf32>) outs(%53 : tensor<1x132x132x144xf32>) -> tensor<1x132x132x144xf32>
flow.return %296 : tensor<1x132x132x144xf32>
}
%extracted_slice_125 = tensor.extract_slice %54[0, 0, 0, 0] [1, 129, 129, 144] [1, 1, 1, 1] : tensor<1x132x132x144xf32> to tensor<129x129x144xf32>
%55 = tensor.empty() : tensor<131x131x144xf32>
%56 = linalg.fill ins(%cst_15 : f32) outs(%55 : tensor<131x131x144xf32>) -> tensor<131x131x144xf32>
%57 = flow.dispatch.region -> (tensor<131x131x144xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_125, %cst_22 : tensor<129x129x144xf32>, tensor<144xf32>) outs(%49 : tensor<129x129x144xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<129x129x144xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %56[1, 1, 0] [129, 129, 144] [1, 1, 1] : tensor<129x129x144xf32> into tensor<131x131x144xf32>
flow.return %inserted_slice_256 : tensor<131x131x144xf32>
}
%expanded_126 = tensor.expand_shape %57 [[0], [1], [2, 3]] output_shape [131, 131, 1, 144] : tensor<131x131x144xf32> into tensor<131x131x1x144xf32>
%58 = tensor.empty() : tensor<1x144x131x131xf32>
%59 = flow.dispatch.region -> (tensor<1x144x131x131xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_126 : tensor<131x131x1x144xf32>) outs(%58 : tensor<1x144x131x131xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x144x131x131xf32>
flow.return %296 : tensor<1x144x131x131xf32>
}
%60 = tensor.empty() : tensor<1x144x129x129xf32>
%61 = linalg.fill ins(%cst_15 : f32) outs(%60 : tensor<1x144x129x129xf32>) -> tensor<1x144x129x129xf32>
%62 = flow.dispatch.region -> (tensor<1x144x129x129xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%59, %cst_51 : tensor<1x144x131x131xf32>, tensor<144x3x3xf32>) outs(%61 : tensor<1x144x129x129xf32>) -> tensor<1x144x129x129xf32>
flow.return %296 : tensor<1x144x129x129xf32>
}
%63 = tensor.empty() : tensor<129x129x1x144xf32>
%64 = flow.dispatch.region -> (tensor<129x129x1x144xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%62 : tensor<1x144x129x129xf32>) outs(%63 : tensor<129x129x1x144xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<129x129x1x144xf32>
flow.return %296 : tensor<129x129x1x144xf32>
}
%collapsed_127 = tensor.collapse_shape %64 [[0], [1], [2, 3]] : tensor<129x129x1x144xf32> into tensor<129x129x144xf32>
%65 = tensor.empty() : tensor<132x132x144xf32>
%66 = linalg.fill ins(%cst_15 : f32) outs(%65 : tensor<132x132x144xf32>) -> tensor<132x132x144xf32>
%inserted_slice_128 = tensor.insert_slice %collapsed_127 into %66[0, 0, 0] [129, 129, 144] [1, 1, 1] : tensor<129x129x144xf32> into tensor<132x132x144xf32>
%expanded_129 = tensor.expand_shape %inserted_slice_128 [[0, 1], [2], [3]] output_shape [1, 132, 132, 144] : tensor<132x132x144xf32> into tensor<1x132x132x144xf32>
%67 = flow.dispatch.region -> (tensor<1x132x132x24xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_129, %cst_77 : tensor<1x132x132x144xf32>, tensor<1x1x144x24xf32>) outs(%44 : tensor<1x132x132x24xf32>) -> tensor<1x132x132x24xf32>
flow.return %296 : tensor<1x132x132x24xf32>
}
%extracted_slice_130 = tensor.extract_slice %67[0, 0, 0, 0] [1, 129, 129, 24] [1, 1, 1, 1] : tensor<1x132x132x24xf32> to tensor<129x129x24xf32>
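// Residual add: %68 fuses the projection bias (%cst_52), a quantize/dequantize at
// scale %cst_10, the elementwise add with the skip connection %48, and a second
// quantize/dequantize at scale %cst_13, then pads the result to 132x132x24.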
%68 = flow.dispatch.region -> (tensor<132x132x24xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%48, %extracted_slice_130, %cst_52 : tensor<129x129x24xf32>, tensor<129x129x24xf32>, tensor<24xf32>) outs(%40 : tensor<129x129x24xf32>) {
^bb0(%in: f32, %in_257: f32, %in_258: f32, %out: f32):
%297 = arith.addf %in_257, %in_258 : f32
%298 = arith.divf %297, %cst_10 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_10 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_13 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
%318 = arith.extsi %317 : i8 to i32
%319 = arith.sitofp %318 : i32 to f32
%320 = arith.mulf %319, %cst_13 : f32
linalg.yield %320 : f32
} -> tensor<129x129x24xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %51[0, 0, 0] [129, 129, 24] [1, 1, 1] : tensor<129x129x24xf32> into tensor<132x132x24xf32>
flow.return %inserted_slice_256 : tensor<132x132x24xf32>
}
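// Stride-2 block down to 65x65: expand 24 -> 144 (%69), depthwise 3x3 with
// stride 2 in NCHW (%74, 131x131 -> 65x65), transpose back with clamp/requantize
// (%76), then project 144 -> 32 (%82) and quantize to i8 (%84).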
%expanded_131 = tensor.expand_shape %68 [[0, 1], [2], [3]] output_shape [1, 132, 132, 24] : tensor<132x132x24xf32> into tensor<1x132x132x24xf32>
%69 = flow.dispatch.region -> (tensor<1x132x132x144xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_131, %cst_78 : tensor<1x132x132x24xf32>, tensor<1x1x24x144xf32>) outs(%53 : tensor<1x132x132x144xf32>) -> tensor<1x132x132x144xf32>
flow.return %296 : tensor<1x132x132x144xf32>
}
%extracted_slice_132 = tensor.extract_slice %69[0, 0, 0, 0] [1, 129, 129, 144] [1, 1, 1, 1] : tensor<1x132x132x144xf32> to tensor<129x129x144xf32>
%70 = flow.dispatch.region -> (tensor<131x131x144xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_132, %cst_54 : tensor<129x129x144xf32>, tensor<144xf32>) outs(%49 : tensor<129x129x144xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<129x129x144xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %56[1, 1, 0] [129, 129, 144] [1, 1, 1] : tensor<129x129x144xf32> into tensor<131x131x144xf32>
flow.return %inserted_slice_256 : tensor<131x131x144xf32>
}
%expanded_133 = tensor.expand_shape %70 [[0], [1], [2, 3]] output_shape [131, 131, 1, 144] : tensor<131x131x144xf32> into tensor<131x131x1x144xf32>
%71 = flow.dispatch.region -> (tensor<1x144x131x131xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_133 : tensor<131x131x1x144xf32>) outs(%58 : tensor<1x144x131x131xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x144x131x131xf32>
flow.return %296 : tensor<1x144x131x131xf32>
}
%72 = tensor.empty() : tensor<1x144x65x65xf32>
%73 = linalg.fill ins(%cst_15 : f32) outs(%72 : tensor<1x144x65x65xf32>) -> tensor<1x144x65x65xf32>
%74 = flow.dispatch.region -> (tensor<1x144x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%71, %cst_53 : tensor<1x144x131x131xf32>, tensor<144x3x3xf32>) outs(%73 : tensor<1x144x65x65xf32>) -> tensor<1x144x65x65xf32>
flow.return %296 : tensor<1x144x65x65xf32>
}
%75 = tensor.empty() : tensor<65x65x1x144xf32>
%76 = flow.dispatch.region -> (tensor<65x65x1x144xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%74 : tensor<1x144x65x65xf32>) outs(%75 : tensor<65x65x1x144xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x144xf32>
flow.return %296 : tensor<65x65x1x144xf32>
}
%collapsed_134 = tensor.collapse_shape %76 [[0], [1], [2, 3]] : tensor<65x65x1x144xf32> into tensor<65x65x144xf32>
%77 = tensor.empty() : tensor<65x65x32xf32>
%78 = tensor.empty() : tensor<68x68x144xf32>
%79 = linalg.fill ins(%cst_15 : f32) outs(%78 : tensor<68x68x144xf32>) -> tensor<68x68x144xf32>
%inserted_slice_135 = tensor.insert_slice %collapsed_134 into %79[0, 0, 0] [65, 65, 144] [1, 1, 1] : tensor<65x65x144xf32> into tensor<68x68x144xf32>
%expanded_136 = tensor.expand_shape %inserted_slice_135 [[0, 1], [2], [3]] output_shape [1, 68, 68, 144] : tensor<68x68x144xf32> into tensor<1x68x68x144xf32>
%80 = tensor.empty() : tensor<1x68x68x32xf32>
%81 = linalg.fill ins(%cst_15 : f32) outs(%80 : tensor<1x68x68x32xf32>) -> tensor<1x68x68x32xf32>
%82 = flow.dispatch.region -> (tensor<1x68x68x32xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_136, %cst_79 : tensor<1x68x68x144xf32>, tensor<1x1x144x32xf32>) outs(%81 : tensor<1x68x68x32xf32>) -> tensor<1x68x68x32xf32>
flow.return %296 : tensor<1x68x68x32xf32>
}
%extracted_slice_137 = tensor.extract_slice %82[0, 0, 0, 0] [1, 65, 65, 32] [1, 1, 1, 1] : tensor<1x68x68x32xf32> to tensor<65x65x32xf32>
%83 = tensor.empty() : tensor<65x65x32xi8>
%84 = flow.dispatch.region -> (tensor<65x65x32xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_137, %cst_55 : tensor<65x65x32xf32>, tensor<32xf32>) outs(%83 : tensor<65x65x32xi8>) {
^bb0(%in: f32, %in_256: f32, %out: i8):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.divf %297, %cst_10 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
linalg.yield %305 : i8
} -> tensor<65x65x32xi8>
flow.return %296 : tensor<65x65x32xi8>
}
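// %85 dequantizes the 65x65x32 projection at scale %cst_10; %91-%104 form another
// expand (32 -> 192) / depthwise 3x3 / project (192 -> 32) block at 65x65, and
// %105 fuses the residual add with %85 into an i8-producing generic.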
%85 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%84 : tensor<65x65x32xi8>) outs(%77 : tensor<65x65x32xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_10 : f32
linalg.yield %298 : f32
} -> tensor<65x65x32xf32>
%86 = tensor.empty() : tensor<65x65x192xf32>
%87 = tensor.empty() : tensor<68x68x32xf32>
%88 = linalg.fill ins(%cst_15 : f32) outs(%87 : tensor<68x68x32xf32>) -> tensor<68x68x32xf32>
%inserted_slice_138 = tensor.insert_slice %85 into %88[0, 0, 0] [65, 65, 32] [1, 1, 1] : tensor<65x65x32xf32> into tensor<68x68x32xf32>
%expanded_139 = tensor.expand_shape %inserted_slice_138 [[0, 1], [2], [3]] output_shape [1, 68, 68, 32] : tensor<68x68x32xf32> into tensor<1x68x68x32xf32>
%89 = tensor.empty() : tensor<1x68x68x192xf32>
%90 = linalg.fill ins(%cst_15 : f32) outs(%89 : tensor<1x68x68x192xf32>) -> tensor<1x68x68x192xf32>
%91 = flow.dispatch.region -> (tensor<1x68x68x192xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_139, %cst_80 : tensor<1x68x68x32xf32>, tensor<1x1x32x192xf32>) outs(%90 : tensor<1x68x68x192xf32>) -> tensor<1x68x68x192xf32>
flow.return %296 : tensor<1x68x68x192xf32>
}
%extracted_slice_140 = tensor.extract_slice %91[0, 0, 0, 0] [1, 65, 65, 192] [1, 1, 1, 1] : tensor<1x68x68x192xf32> to tensor<65x65x192xf32>
%92 = tensor.empty() : tensor<67x67x192xf32>
%93 = linalg.fill ins(%cst_15 : f32) outs(%92 : tensor<67x67x192xf32>) -> tensor<67x67x192xf32>
%94 = flow.dispatch.region -> (tensor<67x67x192xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_140, %cst_23 : tensor<65x65x192xf32>, tensor<192xf32>) outs(%86 : tensor<65x65x192xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x192xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %93[1, 1, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<67x67x192xf32>
flow.return %inserted_slice_256 : tensor<67x67x192xf32>
}
%expanded_141 = tensor.expand_shape %94 [[0], [1], [2, 3]] output_shape [67, 67, 1, 192] : tensor<67x67x192xf32> into tensor<67x67x1x192xf32>
%95 = tensor.empty() : tensor<1x192x67x67xf32>
%96 = flow.dispatch.region -> (tensor<1x192x67x67xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_141 : tensor<67x67x1x192xf32>) outs(%95 : tensor<1x192x67x67xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x192x67x67xf32>
flow.return %296 : tensor<1x192x67x67xf32>
}
%97 = tensor.empty() : tensor<1x192x65x65xf32>
%98 = linalg.fill ins(%cst_15 : f32) outs(%97 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
%99 = flow.dispatch.region -> (tensor<1x192x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%96, %cst_56 : tensor<1x192x67x67xf32>, tensor<192x3x3xf32>) outs(%98 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
flow.return %296 : tensor<1x192x65x65xf32>
}
%100 = tensor.empty() : tensor<65x65x1x192xf32>
%101 = flow.dispatch.region -> (tensor<65x65x1x192xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%99 : tensor<1x192x65x65xf32>) outs(%100 : tensor<65x65x1x192xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x192xf32>
flow.return %296 : tensor<65x65x1x192xf32>
}
%collapsed_142 = tensor.collapse_shape %101 [[0], [1], [2, 3]] : tensor<65x65x1x192xf32> into tensor<65x65x192xf32>
%102 = tensor.empty() : tensor<68x68x192xf32>
%103 = linalg.fill ins(%cst_15 : f32) outs(%102 : tensor<68x68x192xf32>) -> tensor<68x68x192xf32>
%inserted_slice_143 = tensor.insert_slice %collapsed_142 into %103[0, 0, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<68x68x192xf32>
%expanded_144 = tensor.expand_shape %inserted_slice_143 [[0, 1], [2], [3]] output_shape [1, 68, 68, 192] : tensor<68x68x192xf32> into tensor<1x68x68x192xf32>
%104 = flow.dispatch.region -> (tensor<1x68x68x32xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_144, %cst_81 : tensor<1x68x68x192xf32>, tensor<1x1x192x32xf32>) outs(%81 : tensor<1x68x68x32xf32>) -> tensor<1x68x68x32xf32>
flow.return %296 : tensor<1x68x68x32xf32>
}
%extracted_slice_145 = tensor.extract_slice %104[0, 0, 0, 0] [1, 65, 65, 32] [1, 1, 1, 1] : tensor<1x68x68x32xf32> to tensor<65x65x32xf32>
%105 = flow.dispatch.region -> (tensor<65x65x32xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%85, %extracted_slice_145, %cst_57 : tensor<65x65x32xf32>, tensor<65x65x32xf32>, tensor<32xf32>) outs(%83 : tensor<65x65x32xi8>) {
^bb0(%in: f32, %in_256: f32, %in_257: f32, %out: i8):
%297 = arith.addf %in_256, %in_257 : f32
%298 = arith.divf %297, %cst_10 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_10 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_10 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
linalg.yield %317 : i8
} -> tensor<65x65x32xi8>
flow.return %296 : tensor<65x65x32xi8>
}
%106 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%105 : tensor<65x65x32xi8>) outs(%77 : tensor<65x65x32xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_10 : f32
linalg.yield %298 : f32
} -> tensor<65x65x32xf32>
%inserted_slice_146 = tensor.insert_slice %106 into %88[0, 0, 0] [65, 65, 32] [1, 1, 1] : tensor<65x65x32xf32> into tensor<68x68x32xf32>
%expanded_147 = tensor.expand_shape %inserted_slice_146 [[0, 1], [2], [3]] output_shape [1, 68, 68, 32] : tensor<68x68x32xf32> into tensor<1x68x68x32xf32>
%107 = flow.dispatch.region -> (tensor<1x68x68x192xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_147, %cst_82 : tensor<1x68x68x32xf32>, tensor<1x1x32x192xf32>) outs(%90 : tensor<1x68x68x192xf32>) -> tensor<1x68x68x192xf32>
flow.return %296 : tensor<1x68x68x192xf32>
}
%extracted_slice_148 = tensor.extract_slice %107[0, 0, 0, 0] [1, 65, 65, 192] [1, 1, 1, 1] : tensor<1x68x68x192xf32> to tensor<65x65x192xf32>
%108 = flow.dispatch.region -> (tensor<67x67x192xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_148, %cst_59 : tensor<65x65x192xf32>, tensor<192xf32>) outs(%86 : tensor<65x65x192xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x192xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %93[1, 1, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<67x67x192xf32>
flow.return %inserted_slice_256 : tensor<67x67x192xf32>
}
%expanded_149 = tensor.expand_shape %108 [[0], [1], [2, 3]] output_shape [67, 67, 1, 192] : tensor<67x67x192xf32> into tensor<67x67x1x192xf32>
%109 = flow.dispatch.region -> (tensor<1x192x67x67xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_149 : tensor<67x67x1x192xf32>) outs(%95 : tensor<1x192x67x67xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x192x67x67xf32>
flow.return %296 : tensor<1x192x67x67xf32>
}
%110 = flow.dispatch.region -> (tensor<1x192x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%109, %cst_58 : tensor<1x192x67x67xf32>, tensor<192x3x3xf32>) outs(%98 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
flow.return %296 : tensor<1x192x65x65xf32>
}
%111 = flow.dispatch.region -> (tensor<65x65x1x192xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%110 : tensor<1x192x65x65xf32>) outs(%100 : tensor<65x65x1x192xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x192xf32>
flow.return %296 : tensor<65x65x1x192xf32>
}
%collapsed_150 = tensor.collapse_shape %111 [[0], [1], [2, 3]] : tensor<65x65x1x192xf32> into tensor<65x65x192xf32>
%inserted_slice_151 = tensor.insert_slice %collapsed_150 into %103[0, 0, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<68x68x192xf32>
%expanded_152 = tensor.expand_shape %inserted_slice_151 [[0, 1], [2], [3]] output_shape [1, 68, 68, 192] : tensor<68x68x192xf32> into tensor<1x68x68x192xf32>
%112 = flow.dispatch.region -> (tensor<1x68x68x32xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_152, %cst_83 : tensor<1x68x68x192xf32>, tensor<1x1x192x32xf32>) outs(%81 : tensor<1x68x68x32xf32>) -> tensor<1x68x68x32xf32>
flow.return %296 : tensor<1x68x68x32xf32>
}
%extracted_slice_153 = tensor.extract_slice %112[0, 0, 0, 0] [1, 65, 65, 32] [1, 1, 1, 1] : tensor<1x68x68x32xf32> to tensor<65x65x32xf32>
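// Second residual add at 65x65x32: %113 fuses the projection bias (%cst_60), a
// quantize/dequantize at scale %cst_10, the add with the previous block output
// %106, and a final requantize, padding the result to 68x68x32 for the next conv.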
%113 = flow.dispatch.region -> (tensor<68x68x32xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%106, %extracted_slice_153, %cst_60 : tensor<65x65x32xf32>, tensor<65x65x32xf32>, tensor<32xf32>) outs(%77 : tensor<65x65x32xf32>) {
^bb0(%in: f32, %in_257: f32, %in_258: f32, %out: f32):
%297 = arith.addf %in_257, %in_258 : f32
%298 = arith.divf %297, %cst_10 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_10 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_10 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
%318 = arith.extsi %317 : i8 to i32
%319 = arith.sitofp %318 : i32 to f32
%320 = arith.mulf %319, %cst_10 : f32
linalg.yield %320 : f32
} -> tensor<65x65x32xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %88[0, 0, 0] [65, 65, 32] [1, 1, 1] : tensor<65x65x32xf32> into tensor<68x68x32xf32>
flow.return %inserted_slice_256 : tensor<68x68x32xf32>
}
%expanded_154 = tensor.expand_shape %113 [[0, 1], [2], [3]] output_shape [1, 68, 68, 32] : tensor<68x68x32xf32> into tensor<1x68x68x32xf32>
%114 = flow.dispatch.region -> (tensor<1x68x68x192xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_154, %cst_84 : tensor<1x68x68x32xf32>, tensor<1x1x32x192xf32>) outs(%90 : tensor<1x68x68x192xf32>) -> tensor<1x68x68x192xf32>
flow.return %296 : tensor<1x68x68x192xf32>
}
%extracted_slice_155 = tensor.extract_slice %114[0, 0, 0, 0] [1, 65, 65, 192] [1, 1, 1, 1] : tensor<1x68x68x192xf32> to tensor<65x65x192xf32>
%115 = flow.dispatch.region -> (tensor<67x67x192xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_155, %cst_62 : tensor<65x65x192xf32>, tensor<192xf32>) outs(%86 : tensor<65x65x192xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x192xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %93[1, 1, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<67x67x192xf32>
flow.return %inserted_slice_256 : tensor<67x67x192xf32>
}
%expanded_156 = tensor.expand_shape %115 [[0], [1], [2, 3]] output_shape [67, 67, 1, 192] : tensor<67x67x192xf32> into tensor<67x67x1x192xf32>
%116 = flow.dispatch.region -> (tensor<1x192x67x67xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_156 : tensor<67x67x1x192xf32>) outs(%95 : tensor<1x192x67x67xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x192x67x67xf32>
flow.return %296 : tensor<1x192x67x67xf32>
}
%117 = flow.dispatch.region -> (tensor<1x192x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%116, %cst_61 : tensor<1x192x67x67xf32>, tensor<192x3x3xf32>) outs(%98 : tensor<1x192x65x65xf32>) -> tensor<1x192x65x65xf32>
flow.return %296 : tensor<1x192x65x65xf32>
}
%118 = flow.dispatch.region -> (tensor<65x65x1x192xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%117 : tensor<1x192x65x65xf32>) outs(%100 : tensor<65x65x1x192xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x192xf32>
flow.return %296 : tensor<65x65x1x192xf32>
}
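// Channel increase to 64: the 65x65x192 activation is padded to 68x68 and
// projected with a 1x1x192x64 conv (%122); %124 adds the bias (%cst_63) and
// quantizes to i8 at scale %cst_9, and %125 dequantizes back to f32.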
%collapsed_157 = tensor.collapse_shape %118 [[0], [1], [2, 3]] : tensor<65x65x1x192xf32> into tensor<65x65x192xf32>
%119 = tensor.empty() : tensor<65x65x64xf32>
%inserted_slice_158 = tensor.insert_slice %collapsed_157 into %103[0, 0, 0] [65, 65, 192] [1, 1, 1] : tensor<65x65x192xf32> into tensor<68x68x192xf32>
%expanded_159 = tensor.expand_shape %inserted_slice_158 [[0, 1], [2], [3]] output_shape [1, 68, 68, 192] : tensor<68x68x192xf32> into tensor<1x68x68x192xf32>
%120 = tensor.empty() : tensor<1x68x68x64xf32>
%121 = linalg.fill ins(%cst_15 : f32) outs(%120 : tensor<1x68x68x64xf32>) -> tensor<1x68x68x64xf32>
%122 = flow.dispatch.region -> (tensor<1x68x68x64xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_159, %cst_85 : tensor<1x68x68x192xf32>, tensor<1x1x192x64xf32>) outs(%121 : tensor<1x68x68x64xf32>) -> tensor<1x68x68x64xf32>
flow.return %296 : tensor<1x68x68x64xf32>
}
%extracted_slice_160 = tensor.extract_slice %122[0, 0, 0, 0] [1, 65, 65, 64] [1, 1, 1, 1] : tensor<1x68x68x64xf32> to tensor<65x65x64xf32>
%123 = tensor.empty() : tensor<65x65x64xi8>
%124 = flow.dispatch.region -> (tensor<65x65x64xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_160, %cst_63 : tensor<65x65x64xf32>, tensor<64xf32>) outs(%123 : tensor<65x65x64xi8>) {
^bb0(%in: f32, %in_256: f32, %out: i8):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
linalg.yield %305 : i8
} -> tensor<65x65x64xi8>
flow.return %296 : tensor<65x65x64xi8>
}
%125 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%124 : tensor<65x65x64xi8>) outs(%119 : tensor<65x65x64xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_9 : f32
linalg.yield %298 : f32
} -> tensor<65x65x64xf32>
%126 = tensor.empty() : tensor<65x65x384xf32>
%127 = tensor.empty() : tensor<68x68x64xf32>
%128 = linalg.fill ins(%cst_15 : f32) outs(%127 : tensor<68x68x64xf32>) -> tensor<68x68x64xf32>
%inserted_slice_161 = tensor.insert_slice %125 into %128[0, 0, 0] [65, 65, 64] [1, 1, 1] : tensor<65x65x64xf32> into tensor<68x68x64xf32>
%expanded_162 = tensor.expand_shape %inserted_slice_161 [[0, 1], [2], [3]] output_shape [1, 68, 68, 64] : tensor<68x68x64xf32> into tensor<1x68x68x64xf32>
%129 = tensor.empty() : tensor<1x68x68x384xf32>
%130 = linalg.fill ins(%cst_15 : f32) outs(%129 : tensor<1x68x68x384xf32>) -> tensor<1x68x68x384xf32>
%131 = flow.dispatch.region -> (tensor<1x68x68x384xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_162, %cst_86 : tensor<1x68x68x64xf32>, tensor<1x1x64x384xf32>) outs(%130 : tensor<1x68x68x384xf32>) -> tensor<1x68x68x384xf32>
flow.return %296 : tensor<1x68x68x384xf32>
}
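// Dilated (atrous) stage: the 384-channel activation is padded by 2 on every
// spatial edge (insert at offset [2, 2, 0] into a 69x69 buffer) so the 3x3
// depthwise conv with dilation 2 and stride 1 (%139) keeps the 65x65 resolution.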
%extracted_slice_163 = tensor.extract_slice %131[0, 0, 0, 0] [1, 65, 65, 384] [1, 1, 1, 1] : tensor<1x68x68x384xf32> to tensor<65x65x384xf32>
%132 = tensor.empty() : tensor<69x69x384xf32>
%133 = linalg.fill ins(%cst_15 : f32) outs(%132 : tensor<69x69x384xf32>) -> tensor<69x69x384xf32>
%134 = flow.dispatch.region -> (tensor<69x69x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_163, %cst_24 : tensor<65x65x384xf32>, tensor<384xf32>) outs(%126 : tensor<65x65x384xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x384xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %133[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
flow.return %inserted_slice_256 : tensor<69x69x384xf32>
}
%expanded_164 = tensor.expand_shape %134 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%135 = tensor.empty() : tensor<1x384x69x69xf32>
%136 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_164 : tensor<69x69x1x384xf32>) outs(%135 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %296 : tensor<1x384x69x69xf32>
}
%137 = tensor.empty() : tensor<1x384x65x65xf32>
%138 = linalg.fill ins(%cst_15 : f32) outs(%137 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
%139 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%136, %cst_64 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%138 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %296 : tensor<1x384x65x65xf32>
}
%140 = tensor.empty() : tensor<65x65x1x384xf32>
%141 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%139 : tensor<1x384x65x65xf32>) outs(%140 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x384xf32>
flow.return %296 : tensor<65x65x1x384xf32>
}
%collapsed_165 = tensor.collapse_shape %141 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%142 = tensor.empty() : tensor<68x68x384xf32>
%143 = linalg.fill ins(%cst_15 : f32) outs(%142 : tensor<68x68x384xf32>) -> tensor<68x68x384xf32>
%inserted_slice_166 = tensor.insert_slice %collapsed_165 into %143[0, 0, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<68x68x384xf32>
%expanded_167 = tensor.expand_shape %inserted_slice_166 [[0, 1], [2], [3]] output_shape [1, 68, 68, 384] : tensor<68x68x384xf32> into tensor<1x68x68x384xf32>
%144 = flow.dispatch.region -> (tensor<1x68x68x64xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_167, %cst_87 : tensor<1x68x68x384xf32>, tensor<1x1x384x64xf32>) outs(%121 : tensor<1x68x68x64xf32>) -> tensor<1x68x68x64xf32>
flow.return %296 : tensor<1x68x68x64xf32>
}
%extracted_slice_168 = tensor.extract_slice %144[0, 0, 0, 0] [1, 65, 65, 64] [1, 1, 1, 1] : tensor<1x68x68x64xf32> to tensor<65x65x64xf32>
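// Residual add at 64 channels: %145 fuses the bias (%cst_65), a quantize/dequantize
// at scale %cst_9, the add with %125, and quantization to i8 at scale %cst_10;
// %146 dequantizes, and the same dilated expand / depthwise / project pattern
// repeats below.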
%145 = flow.dispatch.region -> (tensor<65x65x64xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%125, %extracted_slice_168, %cst_65 : tensor<65x65x64xf32>, tensor<65x65x64xf32>, tensor<64xf32>) outs(%123 : tensor<65x65x64xi8>) {
^bb0(%in: f32, %in_256: f32, %in_257: f32, %out: i8):
%297 = arith.addf %in_256, %in_257 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_9 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_10 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
linalg.yield %317 : i8
} -> tensor<65x65x64xi8>
flow.return %296 : tensor<65x65x64xi8>
}
%146 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%145 : tensor<65x65x64xi8>) outs(%119 : tensor<65x65x64xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_10 : f32
linalg.yield %298 : f32
} -> tensor<65x65x64xf32>
%inserted_slice_169 = tensor.insert_slice %146 into %128[0, 0, 0] [65, 65, 64] [1, 1, 1] : tensor<65x65x64xf32> into tensor<68x68x64xf32>
%expanded_170 = tensor.expand_shape %inserted_slice_169 [[0, 1], [2], [3]] output_shape [1, 68, 68, 64] : tensor<68x68x64xf32> into tensor<1x68x68x64xf32>
%147 = flow.dispatch.region -> (tensor<1x68x68x384xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_170, %cst_88 : tensor<1x68x68x64xf32>, tensor<1x1x64x384xf32>) outs(%130 : tensor<1x68x68x384xf32>) -> tensor<1x68x68x384xf32>
flow.return %296 : tensor<1x68x68x384xf32>
}
%extracted_slice_171 = tensor.extract_slice %147[0, 0, 0, 0] [1, 65, 65, 384] [1, 1, 1, 1] : tensor<1x68x68x384xf32> to tensor<65x65x384xf32>
%148 = flow.dispatch.region -> (tensor<69x69x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_171, %cst_67 : tensor<65x65x384xf32>, tensor<384xf32>) outs(%126 : tensor<65x65x384xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x384xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %133[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
flow.return %inserted_slice_256 : tensor<69x69x384xf32>
}
%expanded_172 = tensor.expand_shape %148 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%149 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_172 : tensor<69x69x1x384xf32>) outs(%135 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %296 : tensor<1x384x69x69xf32>
}
%150 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%149, %cst_66 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%138 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %296 : tensor<1x384x65x65xf32>
}
%151 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%150 : tensor<1x384x65x65xf32>) outs(%140 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x384xf32>
flow.return %296 : tensor<65x65x1x384xf32>
}
%collapsed_173 = tensor.collapse_shape %151 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%inserted_slice_174 = tensor.insert_slice %collapsed_173 into %143[0, 0, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<68x68x384xf32>
%expanded_175 = tensor.expand_shape %inserted_slice_174 [[0, 1], [2], [3]] output_shape [1, 68, 68, 384] : tensor<68x68x384xf32> into tensor<1x68x68x384xf32>
%152 = flow.dispatch.region -> (tensor<1x68x68x64xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_175, %cst_89 : tensor<1x68x68x384xf32>, tensor<1x1x384x64xf32>) outs(%121 : tensor<1x68x68x64xf32>) -> tensor<1x68x68x64xf32>
flow.return %296 : tensor<1x68x68x64xf32>
}
%extracted_slice_176 = tensor.extract_slice %152[0, 0, 0, 0] [1, 65, 65, 64] [1, 1, 1, 1] : tensor<1x68x68x64xf32> to tensor<65x65x64xf32>
%153 = flow.dispatch.region -> (tensor<65x65x64xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%146, %extracted_slice_176, %cst_68 : tensor<65x65x64xf32>, tensor<65x65x64xf32>, tensor<64xf32>) outs(%123 : tensor<65x65x64xi8>) {
^bb0(%in: f32, %in_256: f32, %in_257: f32, %out: i8):
%297 = arith.addf %in_256, %in_257 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_9 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_10 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
linalg.yield %317 : i8
} -> tensor<65x65x64xi8>
flow.return %296 : tensor<65x65x64xi8>
}
%154 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%153 : tensor<65x65x64xi8>) outs(%119 : tensor<65x65x64xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_10 : f32
linalg.yield %298 : f32
} -> tensor<65x65x64xf32>
%inserted_slice_177 = tensor.insert_slice %154 into %128[0, 0, 0] [65, 65, 64] [1, 1, 1] : tensor<65x65x64xf32> into tensor<68x68x64xf32>
%expanded_178 = tensor.expand_shape %inserted_slice_177 [[0, 1], [2], [3]] output_shape [1, 68, 68, 64] : tensor<68x68x64xf32> into tensor<1x68x68x64xf32>
%155 = flow.dispatch.region -> (tensor<1x68x68x384xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_178, %cst_90 : tensor<1x68x68x64xf32>, tensor<1x1x64x384xf32>) outs(%130 : tensor<1x68x68x384xf32>) -> tensor<1x68x68x384xf32>
flow.return %296 : tensor<1x68x68x384xf32>
}
%extracted_slice_179 = tensor.extract_slice %155[0, 0, 0, 0] [1, 65, 65, 384] [1, 1, 1, 1] : tensor<1x68x68x384xf32> to tensor<65x65x384xf32>
%156 = flow.dispatch.region -> (tensor<69x69x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_179, %cst_70 : tensor<65x65x384xf32>, tensor<384xf32>) outs(%126 : tensor<65x65x384xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x384xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %133[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
flow.return %inserted_slice_256 : tensor<69x69x384xf32>
}
%expanded_180 = tensor.expand_shape %156 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%157 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_180 : tensor<69x69x1x384xf32>) outs(%135 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %296 : tensor<1x384x69x69xf32>
}
%158 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%157, %cst_69 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%138 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %296 : tensor<1x384x65x65xf32>
}
%159 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%158 : tensor<1x384x65x65xf32>) outs(%140 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x384xf32>
flow.return %296 : tensor<65x65x1x384xf32>
}
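    // Dilated depthwise stage: the activation is placed at offset [2, 2, 0] inside a 69x69 buffer (symmetric padding of 2 for the 3x3 depthwise conv with dilation 2), transposed to NCHW for linalg.depthwise_conv_2d_nchw_chw, then transposed back to HWC in %159 with the clamp and i8 requantization fused into the same generic.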
%collapsed_181 = tensor.collapse_shape %159 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%inserted_slice_182 = tensor.insert_slice %collapsed_181 into %143[0, 0, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<68x68x384xf32>
%expanded_183 = tensor.expand_shape %inserted_slice_182 [[0, 1], [2], [3]] output_shape [1, 68, 68, 384] : tensor<68x68x384xf32> into tensor<1x68x68x384xf32>
%160 = flow.dispatch.region -> (tensor<1x68x68x64xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_183, %cst_91 : tensor<1x68x68x384xf32>, tensor<1x1x384x64xf32>) outs(%121 : tensor<1x68x68x64xf32>) -> tensor<1x68x68x64xf32>
flow.return %296 : tensor<1x68x68x64xf32>
}
%extracted_slice_184 = tensor.extract_slice %160[0, 0, 0, 0] [1, 65, 65, 64] [1, 1, 1, 1] : tensor<1x68x68x64xf32> to tensor<65x65x64xf32>
%161 = flow.dispatch.region -> (tensor<68x68x64xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%154, %extracted_slice_184, %cst_71 : tensor<65x65x64xf32>, tensor<65x65x64xf32>, tensor<64xf32>) outs(%119 : tensor<65x65x64xf32>) {
^bb0(%in: f32, %in_257: f32, %in_258: f32, %out: f32):
%297 = arith.addf %in_257, %in_258 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_9 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_10 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
%318 = arith.extsi %317 : i8 to i32
%319 = arith.sitofp %318 : i32 to f32
%320 = arith.mulf %319, %cst_10 : f32
linalg.yield %320 : f32
} -> tensor<65x65x64xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %128[0, 0, 0] [65, 65, 64] [1, 1, 1] : tensor<65x65x64xf32> into tensor<68x68x64xf32>
flow.return %inserted_slice_256 : tensor<68x68x64xf32>
}
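    // %161 fuses the inverted-residual skip connection: the projected branch gets its bias (%cst_71), is quantized and dequantized at one scale (%cst_9), added to the skip input %154, then requantized at a second scale (%cst_10, presumably the block's output scale) before being re-padded to 68x68.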
%expanded_185 = tensor.expand_shape %161 [[0, 1], [2], [3]] output_shape [1, 68, 68, 64] : tensor<68x68x64xf32> into tensor<1x68x68x64xf32>
%162 = flow.dispatch.region -> (tensor<1x68x68x384xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_185, %cst_92 : tensor<1x68x68x64xf32>, tensor<1x1x64x384xf32>) outs(%130 : tensor<1x68x68x384xf32>) -> tensor<1x68x68x384xf32>
flow.return %296 : tensor<1x68x68x384xf32>
}
%extracted_slice_186 = tensor.extract_slice %162[0, 0, 0, 0] [1, 65, 65, 384] [1, 1, 1, 1] : tensor<1x68x68x384xf32> to tensor<65x65x384xf32>
%163 = flow.dispatch.region -> (tensor<69x69x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_186, %cst_33 : tensor<65x65x384xf32>, tensor<384xf32>) outs(%126 : tensor<65x65x384xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x384xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %133[2, 2, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<69x69x384xf32>
flow.return %inserted_slice_256 : tensor<69x69x384xf32>
}
%expanded_187 = tensor.expand_shape %163 [[0], [1], [2, 3]] output_shape [69, 69, 1, 384] : tensor<69x69x384xf32> into tensor<69x69x1x384xf32>
%164 = flow.dispatch.region -> (tensor<1x384x69x69xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_187 : tensor<69x69x1x384xf32>) outs(%135 : tensor<1x384x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x384x69x69xf32>
flow.return %296 : tensor<1x384x69x69xf32>
}
%165 = flow.dispatch.region -> (tensor<1x384x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%164, %cst_32 : tensor<1x384x69x69xf32>, tensor<384x3x3xf32>) outs(%138 : tensor<1x384x65x65xf32>) -> tensor<1x384x65x65xf32>
flow.return %296 : tensor<1x384x65x65xf32>
}
%166 = flow.dispatch.region -> (tensor<65x65x1x384xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%165 : tensor<1x384x65x65xf32>) outs(%140 : tensor<65x65x1x384xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x384xf32>
flow.return %296 : tensor<65x65x1x384xf32>
}
%collapsed_188 = tensor.collapse_shape %166 [[0], [1], [2, 3]] : tensor<65x65x1x384xf32> into tensor<65x65x384xf32>
%167 = tensor.empty() : tensor<65x65x96xf32>
%inserted_slice_189 = tensor.insert_slice %collapsed_188 into %143[0, 0, 0] [65, 65, 384] [1, 1, 1] : tensor<65x65x384xf32> into tensor<68x68x384xf32>
%expanded_190 = tensor.expand_shape %inserted_slice_189 [[0, 1], [2], [3]] output_shape [1, 68, 68, 384] : tensor<68x68x384xf32> into tensor<1x68x68x384xf32>
%168 = tensor.empty() : tensor<1x68x68x96xf32>
%169 = linalg.fill ins(%cst_15 : f32) outs(%168 : tensor<1x68x68x96xf32>) -> tensor<1x68x68x96xf32>
%170 = flow.dispatch.region -> (tensor<1x68x68x96xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_190, %cst_93 : tensor<1x68x68x384xf32>, tensor<1x1x384x96xf32>) outs(%169 : tensor<1x68x68x96xf32>) -> tensor<1x68x68x96xf32>
flow.return %296 : tensor<1x68x68x96xf32>
}
%extracted_slice_191 = tensor.extract_slice %170[0, 0, 0, 0] [1, 65, 65, 96] [1, 1, 1, 1] : tensor<1x68x68x96xf32> to tensor<65x65x96xf32>
%171 = tensor.empty() : tensor<65x65x96xi8>
%172 = flow.dispatch.region -> (tensor<65x65x96xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_191, %cst_34 : tensor<65x65x96xf32>, tensor<96xf32>) outs(%171 : tensor<65x65x96xi8>) {
^bb0(%in: f32, %in_256: f32, %out: i8):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
linalg.yield %305 : i8
} -> tensor<65x65x96xi8>
flow.return %296 : tensor<65x65x96xi8>
}
%173 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%172 : tensor<65x65x96xi8>) outs(%167 : tensor<65x65x96xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_9 : f32
linalg.yield %298 : f32
} -> tensor<65x65x96xf32>
%174 = tensor.empty() : tensor<65x65x576xf32>
%175 = tensor.empty() : tensor<68x68x96xf32>
%176 = linalg.fill ins(%cst_15 : f32) outs(%175 : tensor<68x68x96xf32>) -> tensor<68x68x96xf32>
%inserted_slice_192 = tensor.insert_slice %173 into %176[0, 0, 0] [65, 65, 96] [1, 1, 1] : tensor<65x65x96xf32> into tensor<68x68x96xf32>
%expanded_193 = tensor.expand_shape %inserted_slice_192 [[0, 1], [2], [3]] output_shape [1, 68, 68, 96] : tensor<68x68x96xf32> into tensor<1x68x68x96xf32>
%177 = tensor.empty() : tensor<1x68x68x576xf32>
%178 = linalg.fill ins(%cst_15 : f32) outs(%177 : tensor<1x68x68x576xf32>) -> tensor<1x68x68x576xf32>
%179 = flow.dispatch.region -> (tensor<1x68x68x576xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_193, %cst_94 : tensor<1x68x68x96xf32>, tensor<1x1x96x576xf32>) outs(%178 : tensor<1x68x68x576xf32>) -> tensor<1x68x68x576xf32>
flow.return %296 : tensor<1x68x68x576xf32>
}
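    // The same expand / depthwise / project structure now repeats at larger widths (96 -> 576 here, 160 -> 960 further down), which is consistent with a MobileNetV2-style backbone; only the channel counts, quantization scales, and depthwise dilation change between blocks.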
%extracted_slice_194 = tensor.extract_slice %179[0, 0, 0, 0] [1, 65, 65, 576] [1, 1, 1, 1] : tensor<1x68x68x576xf32> to tensor<65x65x576xf32>
%180 = tensor.empty() : tensor<69x69x576xf32>
%181 = linalg.fill ins(%cst_15 : f32) outs(%180 : tensor<69x69x576xf32>) -> tensor<69x69x576xf32>
%182 = flow.dispatch.region -> (tensor<69x69x576xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_194, %cst_20 : tensor<65x65x576xf32>, tensor<576xf32>) outs(%174 : tensor<65x65x576xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x576xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %181[2, 2, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<69x69x576xf32>
flow.return %inserted_slice_256 : tensor<69x69x576xf32>
}
%expanded_195 = tensor.expand_shape %182 [[0], [1], [2, 3]] output_shape [69, 69, 1, 576] : tensor<69x69x576xf32> into tensor<69x69x1x576xf32>
%183 = tensor.empty() : tensor<1x576x69x69xf32>
%184 = flow.dispatch.region -> (tensor<1x576x69x69xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_195 : tensor<69x69x1x576xf32>) outs(%183 : tensor<1x576x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x576x69x69xf32>
flow.return %296 : tensor<1x576x69x69xf32>
}
%185 = tensor.empty() : tensor<1x576x65x65xf32>
%186 = linalg.fill ins(%cst_15 : f32) outs(%185 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
%187 = flow.dispatch.region -> (tensor<1x576x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%184, %cst_35 : tensor<1x576x69x69xf32>, tensor<576x3x3xf32>) outs(%186 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
flow.return %296 : tensor<1x576x65x65xf32>
}
%188 = tensor.empty() : tensor<65x65x1x576xf32>
%189 = flow.dispatch.region -> (tensor<65x65x1x576xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%187 : tensor<1x576x65x65xf32>) outs(%188 : tensor<65x65x1x576xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x576xf32>
flow.return %296 : tensor<65x65x1x576xf32>
}
%collapsed_196 = tensor.collapse_shape %189 [[0], [1], [2, 3]] : tensor<65x65x1x576xf32> into tensor<65x65x576xf32>
%190 = tensor.empty() : tensor<68x68x576xf32>
%191 = linalg.fill ins(%cst_15 : f32) outs(%190 : tensor<68x68x576xf32>) -> tensor<68x68x576xf32>
%inserted_slice_197 = tensor.insert_slice %collapsed_196 into %191[0, 0, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<68x68x576xf32>
%expanded_198 = tensor.expand_shape %inserted_slice_197 [[0, 1], [2], [3]] output_shape [1, 68, 68, 576] : tensor<68x68x576xf32> into tensor<1x68x68x576xf32>
%192 = flow.dispatch.region -> (tensor<1x68x68x96xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_198, %cst_95 : tensor<1x68x68x576xf32>, tensor<1x1x576x96xf32>) outs(%169 : tensor<1x68x68x96xf32>) -> tensor<1x68x68x96xf32>
flow.return %296 : tensor<1x68x68x96xf32>
}
%extracted_slice_199 = tensor.extract_slice %192[0, 0, 0, 0] [1, 65, 65, 96] [1, 1, 1, 1] : tensor<1x68x68x96xf32> to tensor<65x65x96xf32>
%193 = flow.dispatch.region -> (tensor<65x65x96xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%173, %extracted_slice_199, %cst_36 : tensor<65x65x96xf32>, tensor<65x65x96xf32>, tensor<96xf32>) outs(%171 : tensor<65x65x96xi8>) {
^bb0(%in: f32, %in_256: f32, %in_257: f32, %out: i8):
%297 = arith.addf %in_256, %in_257 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_9 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_9 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
linalg.yield %317 : i8
} -> tensor<65x65x96xi8>
flow.return %296 : tensor<65x65x96xi8>
}
%194 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%193 : tensor<65x65x96xi8>) outs(%167 : tensor<65x65x96xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_9 : f32
linalg.yield %298 : f32
} -> tensor<65x65x96xf32>
%inserted_slice_200 = tensor.insert_slice %194 into %176[0, 0, 0] [65, 65, 96] [1, 1, 1] : tensor<65x65x96xf32> into tensor<68x68x96xf32>
%expanded_201 = tensor.expand_shape %inserted_slice_200 [[0, 1], [2], [3]] output_shape [1, 68, 68, 96] : tensor<68x68x96xf32> into tensor<1x68x68x96xf32>
%195 = flow.dispatch.region -> (tensor<1x68x68x576xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_201, %cst_96 : tensor<1x68x68x96xf32>, tensor<1x1x96x576xf32>) outs(%178 : tensor<1x68x68x576xf32>) -> tensor<1x68x68x576xf32>
flow.return %296 : tensor<1x68x68x576xf32>
}
%extracted_slice_202 = tensor.extract_slice %195[0, 0, 0, 0] [1, 65, 65, 576] [1, 1, 1, 1] : tensor<1x68x68x576xf32> to tensor<65x65x576xf32>
%196 = flow.dispatch.region -> (tensor<69x69x576xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_202, %cst_38 : tensor<65x65x576xf32>, tensor<576xf32>) outs(%174 : tensor<65x65x576xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x576xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %181[2, 2, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<69x69x576xf32>
flow.return %inserted_slice_256 : tensor<69x69x576xf32>
}
%expanded_203 = tensor.expand_shape %196 [[0], [1], [2, 3]] output_shape [69, 69, 1, 576] : tensor<69x69x576xf32> into tensor<69x69x1x576xf32>
%197 = flow.dispatch.region -> (tensor<1x576x69x69xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_203 : tensor<69x69x1x576xf32>) outs(%183 : tensor<1x576x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x576x69x69xf32>
flow.return %296 : tensor<1x576x69x69xf32>
}
%198 = flow.dispatch.region -> (tensor<1x576x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%197, %cst_37 : tensor<1x576x69x69xf32>, tensor<576x3x3xf32>) outs(%186 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
flow.return %296 : tensor<1x576x65x65xf32>
}
%199 = flow.dispatch.region -> (tensor<65x65x1x576xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%198 : tensor<1x576x65x65xf32>) outs(%188 : tensor<65x65x1x576xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x576xf32>
flow.return %296 : tensor<65x65x1x576xf32>
}
%collapsed_204 = tensor.collapse_shape %199 [[0], [1], [2, 3]] : tensor<65x65x1x576xf32> into tensor<65x65x576xf32>
%inserted_slice_205 = tensor.insert_slice %collapsed_204 into %191[0, 0, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<68x68x576xf32>
%expanded_206 = tensor.expand_shape %inserted_slice_205 [[0, 1], [2], [3]] output_shape [1, 68, 68, 576] : tensor<68x68x576xf32> into tensor<1x68x68x576xf32>
%200 = flow.dispatch.region -> (tensor<1x68x68x96xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_206, %cst_97 : tensor<1x68x68x576xf32>, tensor<1x1x576x96xf32>) outs(%169 : tensor<1x68x68x96xf32>) -> tensor<1x68x68x96xf32>
flow.return %296 : tensor<1x68x68x96xf32>
}
%extracted_slice_207 = tensor.extract_slice %200[0, 0, 0, 0] [1, 65, 65, 96] [1, 1, 1, 1] : tensor<1x68x68x96xf32> to tensor<65x65x96xf32>
%201 = flow.dispatch.region -> (tensor<68x68x96xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%194, %extracted_slice_207, %cst_39 : tensor<65x65x96xf32>, tensor<65x65x96xf32>, tensor<96xf32>) outs(%167 : tensor<65x65x96xf32>) {
^bb0(%in: f32, %in_257: f32, %in_258: f32, %out: f32):
%297 = arith.addf %in_257, %in_258 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_9 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_9 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
%318 = arith.extsi %317 : i8 to i32
%319 = arith.sitofp %318 : i32 to f32
%320 = arith.mulf %319, %cst_9 : f32
linalg.yield %320 : f32
} -> tensor<65x65x96xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %176[0, 0, 0] [65, 65, 96] [1, 1, 1] : tensor<65x65x96xf32> into tensor<68x68x96xf32>
flow.return %inserted_slice_256 : tensor<68x68x96xf32>
}
%expanded_208 = tensor.expand_shape %201 [[0, 1], [2], [3]] output_shape [1, 68, 68, 96] : tensor<68x68x96xf32> into tensor<1x68x68x96xf32>
%202 = flow.dispatch.region -> (tensor<1x68x68x576xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_208, %cst_98 : tensor<1x68x68x96xf32>, tensor<1x1x96x576xf32>) outs(%178 : tensor<1x68x68x576xf32>) -> tensor<1x68x68x576xf32>
flow.return %296 : tensor<1x68x68x576xf32>
}
%extracted_slice_209 = tensor.extract_slice %202[0, 0, 0, 0] [1, 65, 65, 576] [1, 1, 1, 1] : tensor<1x68x68x576xf32> to tensor<65x65x576xf32>
%203 = flow.dispatch.region -> (tensor<69x69x576xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_209, %cst_41 : tensor<65x65x576xf32>, tensor<576xf32>) outs(%174 : tensor<65x65x576xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x576xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %181[2, 2, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<69x69x576xf32>
flow.return %inserted_slice_256 : tensor<69x69x576xf32>
}
%expanded_210 = tensor.expand_shape %203 [[0], [1], [2, 3]] output_shape [69, 69, 1, 576] : tensor<69x69x576xf32> into tensor<69x69x1x576xf32>
%204 = flow.dispatch.region -> (tensor<1x576x69x69xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_210 : tensor<69x69x1x576xf32>) outs(%183 : tensor<1x576x69x69xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x576x69x69xf32>
flow.return %296 : tensor<1x576x69x69xf32>
}
%205 = flow.dispatch.region -> (tensor<1x576x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<2> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%204, %cst_40 : tensor<1x576x69x69xf32>, tensor<576x3x3xf32>) outs(%186 : tensor<1x576x65x65xf32>) -> tensor<1x576x65x65xf32>
flow.return %296 : tensor<1x576x65x65xf32>
}
%206 = flow.dispatch.region -> (tensor<65x65x1x576xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%205 : tensor<1x576x65x65xf32>) outs(%188 : tensor<65x65x1x576xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x576xf32>
flow.return %296 : tensor<65x65x1x576xf32>
}
%collapsed_211 = tensor.collapse_shape %206 [[0], [1], [2, 3]] : tensor<65x65x1x576xf32> into tensor<65x65x576xf32>
%207 = tensor.empty() : tensor<65x65x160xf32>
%inserted_slice_212 = tensor.insert_slice %collapsed_211 into %191[0, 0, 0] [65, 65, 576] [1, 1, 1] : tensor<65x65x576xf32> into tensor<68x68x576xf32>
%expanded_213 = tensor.expand_shape %inserted_slice_212 [[0, 1], [2], [3]] output_shape [1, 68, 68, 576] : tensor<68x68x576xf32> into tensor<1x68x68x576xf32>
%208 = tensor.empty() : tensor<1x68x68x160xf32>
%209 = linalg.fill ins(%cst_15 : f32) outs(%208 : tensor<1x68x68x160xf32>) -> tensor<1x68x68x160xf32>
%210 = flow.dispatch.region -> (tensor<1x68x68x160xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_213, %cst_99 : tensor<1x68x68x576xf32>, tensor<1x1x576x160xf32>) outs(%209 : tensor<1x68x68x160xf32>) -> tensor<1x68x68x160xf32>
flow.return %296 : tensor<1x68x68x160xf32>
}
%extracted_slice_214 = tensor.extract_slice %210[0, 0, 0, 0] [1, 65, 65, 160] [1, 1, 1, 1] : tensor<1x68x68x160xf32> to tensor<65x65x160xf32>
%211 = tensor.empty() : tensor<65x65x160xi8>
%212 = flow.dispatch.region -> (tensor<65x65x160xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_214, %cst_42 : tensor<65x65x160xf32>, tensor<160xf32>) outs(%211 : tensor<65x65x160xi8>) {
^bb0(%in: f32, %in_256: f32, %out: i8):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
linalg.yield %305 : i8
} -> tensor<65x65x160xi8>
flow.return %296 : tensor<65x65x160xi8>
}
%213 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%212 : tensor<65x65x160xi8>) outs(%207 : tensor<65x65x160xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_9 : f32
linalg.yield %298 : f32
} -> tensor<65x65x160xf32>
%214 = tensor.empty() : tensor<65x65x960xf32>
%215 = tensor.empty() : tensor<68x68x160xf32>
%216 = linalg.fill ins(%cst_15 : f32) outs(%215 : tensor<68x68x160xf32>) -> tensor<68x68x160xf32>
%inserted_slice_215 = tensor.insert_slice %213 into %216[0, 0, 0] [65, 65, 160] [1, 1, 1] : tensor<65x65x160xf32> into tensor<68x68x160xf32>
%expanded_216 = tensor.expand_shape %inserted_slice_215 [[0, 1], [2], [3]] output_shape [1, 68, 68, 160] : tensor<68x68x160xf32> into tensor<1x68x68x160xf32>
%217 = tensor.empty() : tensor<1x68x68x960xf32>
%218 = linalg.fill ins(%cst_15 : f32) outs(%217 : tensor<1x68x68x960xf32>) -> tensor<1x68x68x960xf32>
%219 = flow.dispatch.region -> (tensor<1x68x68x960xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_216, %cst_100 : tensor<1x68x68x160xf32>, tensor<1x1x160x960xf32>) outs(%218 : tensor<1x68x68x960xf32>) -> tensor<1x68x68x960xf32>
flow.return %296 : tensor<1x68x68x960xf32>
}
%extracted_slice_217 = tensor.extract_slice %219[0, 0, 0, 0] [1, 65, 65, 960] [1, 1, 1, 1] : tensor<1x68x68x960xf32> to tensor<65x65x960xf32>
%220 = tensor.empty() : tensor<73x73x960xf32>
%221 = linalg.fill ins(%cst_15 : f32) outs(%220 : tensor<73x73x960xf32>) -> tensor<73x73x960xf32>
%222 = flow.dispatch.region -> (tensor<73x73x960xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_217, %cst_21 : tensor<65x65x960xf32>, tensor<960xf32>) outs(%214 : tensor<65x65x960xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x960xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %221[4, 4, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<73x73x960xf32>
flow.return %inserted_slice_256 : tensor<73x73x960xf32>
}
%expanded_218 = tensor.expand_shape %222 [[0], [1], [2, 3]] output_shape [73, 73, 1, 960] : tensor<73x73x960xf32> into tensor<73x73x1x960xf32>
%223 = tensor.empty() : tensor<1x960x73x73xf32>
%224 = flow.dispatch.region -> (tensor<1x960x73x73xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_218 : tensor<73x73x1x960xf32>) outs(%223 : tensor<1x960x73x73xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x960x73x73xf32>
flow.return %296 : tensor<1x960x73x73xf32>
}
%225 = tensor.empty() : tensor<1x960x65x65xf32>
%226 = linalg.fill ins(%cst_15 : f32) outs(%225 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
%227 = flow.dispatch.region -> (tensor<1x960x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<4> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%224, %cst_43 : tensor<1x960x73x73xf32>, tensor<960x3x3xf32>) outs(%226 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
flow.return %296 : tensor<1x960x65x65xf32>
}
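    // For the 960-channel blocks the depthwise conv uses dilation 4, so the padding grows accordingly: the 65x65 activation is inserted at offset [4, 4, 0] into a 73x73 buffer before the NCHW depthwise conv, which brings the spatial size back to 65x65.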
%228 = tensor.empty() : tensor<65x65x1x960xf32>
%229 = flow.dispatch.region -> (tensor<65x65x1x960xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%227 : tensor<1x960x65x65xf32>) outs(%228 : tensor<65x65x1x960xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x960xf32>
flow.return %296 : tensor<65x65x1x960xf32>
}
%collapsed_219 = tensor.collapse_shape %229 [[0], [1], [2, 3]] : tensor<65x65x1x960xf32> into tensor<65x65x960xf32>
%230 = tensor.empty() : tensor<68x68x960xf32>
%231 = linalg.fill ins(%cst_15 : f32) outs(%230 : tensor<68x68x960xf32>) -> tensor<68x68x960xf32>
%inserted_slice_220 = tensor.insert_slice %collapsed_219 into %231[0, 0, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<68x68x960xf32>
%expanded_221 = tensor.expand_shape %inserted_slice_220 [[0, 1], [2], [3]] output_shape [1, 68, 68, 960] : tensor<68x68x960xf32> into tensor<1x68x68x960xf32>
%232 = flow.dispatch.region -> (tensor<1x68x68x160xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_221, %cst_101 : tensor<1x68x68x960xf32>, tensor<1x1x960x160xf32>) outs(%209 : tensor<1x68x68x160xf32>) -> tensor<1x68x68x160xf32>
flow.return %296 : tensor<1x68x68x160xf32>
}
%extracted_slice_222 = tensor.extract_slice %232[0, 0, 0, 0] [1, 65, 65, 160] [1, 1, 1, 1] : tensor<1x68x68x160xf32> to tensor<65x65x160xf32>
%233 = flow.dispatch.region -> (tensor<65x65x160xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%213, %extracted_slice_222, %cst_44 : tensor<65x65x160xf32>, tensor<65x65x160xf32>, tensor<160xf32>) outs(%211 : tensor<65x65x160xi8>) {
^bb0(%in: f32, %in_256: f32, %in_257: f32, %out: i8):
%297 = arith.addf %in_256, %in_257 : f32
%298 = arith.divf %297, %cst_11 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_11 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_9 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
linalg.yield %317 : i8
} -> tensor<65x65x160xi8>
flow.return %296 : tensor<65x65x160xi8>
}
%234 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%233 : tensor<65x65x160xi8>) outs(%207 : tensor<65x65x160xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_9 : f32
linalg.yield %298 : f32
} -> tensor<65x65x160xf32>
%inserted_slice_223 = tensor.insert_slice %234 into %216[0, 0, 0] [65, 65, 160] [1, 1, 1] : tensor<65x65x160xf32> into tensor<68x68x160xf32>
%expanded_224 = tensor.expand_shape %inserted_slice_223 [[0, 1], [2], [3]] output_shape [1, 68, 68, 160] : tensor<68x68x160xf32> into tensor<1x68x68x160xf32>
%235 = flow.dispatch.region -> (tensor<1x68x68x960xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_224, %cst_102 : tensor<1x68x68x160xf32>, tensor<1x1x160x960xf32>) outs(%218 : tensor<1x68x68x960xf32>) -> tensor<1x68x68x960xf32>
flow.return %296 : tensor<1x68x68x960xf32>
}
%extracted_slice_225 = tensor.extract_slice %235[0, 0, 0, 0] [1, 65, 65, 960] [1, 1, 1, 1] : tensor<1x68x68x960xf32> to tensor<65x65x960xf32>
%236 = flow.dispatch.region -> (tensor<73x73x960xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_225, %cst_46 : tensor<65x65x960xf32>, tensor<960xf32>) outs(%214 : tensor<65x65x960xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x960xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %221[4, 4, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<73x73x960xf32>
flow.return %inserted_slice_256 : tensor<73x73x960xf32>
}
%expanded_226 = tensor.expand_shape %236 [[0], [1], [2, 3]] output_shape [73, 73, 1, 960] : tensor<73x73x960xf32> into tensor<73x73x1x960xf32>
%237 = flow.dispatch.region -> (tensor<1x960x73x73xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_226 : tensor<73x73x1x960xf32>) outs(%223 : tensor<1x960x73x73xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x960x73x73xf32>
flow.return %296 : tensor<1x960x73x73xf32>
}
%238 = flow.dispatch.region -> (tensor<1x960x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<4> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%237, %cst_45 : tensor<1x960x73x73xf32>, tensor<960x3x3xf32>) outs(%226 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
flow.return %296 : tensor<1x960x65x65xf32>
}
%239 = flow.dispatch.region -> (tensor<65x65x1x960xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%238 : tensor<1x960x65x65xf32>) outs(%228 : tensor<65x65x1x960xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x960xf32>
flow.return %296 : tensor<65x65x1x960xf32>
}
%collapsed_227 = tensor.collapse_shape %239 [[0], [1], [2, 3]] : tensor<65x65x1x960xf32> into tensor<65x65x960xf32>
%inserted_slice_228 = tensor.insert_slice %collapsed_227 into %231[0, 0, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<68x68x960xf32>
%expanded_229 = tensor.expand_shape %inserted_slice_228 [[0, 1], [2], [3]] output_shape [1, 68, 68, 960] : tensor<68x68x960xf32> into tensor<1x68x68x960xf32>
%240 = flow.dispatch.region -> (tensor<1x68x68x160xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_229, %cst_103 : tensor<1x68x68x960xf32>, tensor<1x1x960x160xf32>) outs(%209 : tensor<1x68x68x160xf32>) -> tensor<1x68x68x160xf32>
flow.return %296 : tensor<1x68x68x160xf32>
}
%extracted_slice_230 = tensor.extract_slice %240[0, 0, 0, 0] [1, 65, 65, 160] [1, 1, 1, 1] : tensor<1x68x68x160xf32> to tensor<65x65x160xf32>
%241 = flow.dispatch.region -> (tensor<68x68x160xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%234, %extracted_slice_230, %cst_47 : tensor<65x65x160xf32>, tensor<65x65x160xf32>, tensor<160xf32>) outs(%207 : tensor<65x65x160xf32>) {
^bb0(%in: f32, %in_257: f32, %in_258: f32, %out: f32):
%297 = arith.addf %in_257, %in_258 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
%306 = arith.extsi %305 : i8 to i32
%307 = arith.sitofp %306 : i32 to f32
%308 = arith.mulf %307, %cst_9 : f32
%309 = arith.addf %in, %308 : f32
%310 = arith.divf %309, %cst_9 : f32
%311 = math.round %310 : f32
%312 = arith.addf %311, %cst_15 : f32
%313 = arith.cmpf ult, %312, %cst_17 : f32
%314 = arith.cmpf ugt, %312, %cst_16 : f32
%315 = arith.select %313, %cst_17, %312 : f32
%316 = arith.select %314, %cst_16, %315 : f32
%317 = arith.fptosi %316 : f32 to i8
%318 = arith.extsi %317 : i8 to i32
%319 = arith.sitofp %318 : i32 to f32
%320 = arith.mulf %319, %cst_9 : f32
linalg.yield %320 : f32
} -> tensor<65x65x160xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %216[0, 0, 0] [65, 65, 160] [1, 1, 1] : tensor<65x65x160xf32> into tensor<68x68x160xf32>
flow.return %inserted_slice_256 : tensor<68x68x160xf32>
}
%expanded_231 = tensor.expand_shape %241 [[0, 1], [2], [3]] output_shape [1, 68, 68, 160] : tensor<68x68x160xf32> into tensor<1x68x68x160xf32>
%242 = flow.dispatch.region -> (tensor<1x68x68x960xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_231, %cst_104 : tensor<1x68x68x160xf32>, tensor<1x1x160x960xf32>) outs(%218 : tensor<1x68x68x960xf32>) -> tensor<1x68x68x960xf32>
flow.return %296 : tensor<1x68x68x960xf32>
}
%extracted_slice_232 = tensor.extract_slice %242[0, 0, 0, 0] [1, 65, 65, 960] [1, 1, 1, 1] : tensor<1x68x68x960xf32> to tensor<65x65x960xf32>
%243 = flow.dispatch.region -> (tensor<73x73x960xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_232, %cst_49 : tensor<65x65x960xf32>, tensor<960xf32>) outs(%214 : tensor<65x65x960xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ult, %297, %cst_15 : f32
%299 = arith.select %298, %cst_15, %297 : f32
%300 = arith.cmpf ugt, %299, %cst_2 : f32
%301 = arith.select %300, %cst_2, %299 : f32
%302 = arith.divf %301, %cst_11 : f32
%303 = math.round %302 : f32
%304 = arith.addf %303, %cst_15 : f32
%305 = arith.cmpf ult, %304, %cst_17 : f32
%306 = arith.cmpf ugt, %304, %cst_16 : f32
%307 = arith.select %305, %cst_17, %304 : f32
%308 = arith.select %306, %cst_16, %307 : f32
%309 = arith.fptosi %308 : f32 to i8
%310 = arith.extsi %309 : i8 to i32
%311 = arith.sitofp %310 : i32 to f32
%312 = arith.mulf %311, %cst_11 : f32
linalg.yield %312 : f32
} -> tensor<65x65x960xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %221[4, 4, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<73x73x960xf32>
flow.return %inserted_slice_256 : tensor<73x73x960xf32>
}
%expanded_233 = tensor.expand_shape %243 [[0], [1], [2, 3]] output_shape [73, 73, 1, 960] : tensor<73x73x960xf32> into tensor<73x73x1x960xf32>
%244 = flow.dispatch.region -> (tensor<1x960x73x73xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_233 : tensor<73x73x1x960xf32>) outs(%223 : tensor<1x960x73x73xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<1x960x73x73xf32>
flow.return %296 : tensor<1x960x73x73xf32>
}
%245 = flow.dispatch.region -> (tensor<1x960x65x65xf32>) {
%296 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<4> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%244, %cst_48 : tensor<1x960x73x73xf32>, tensor<960x3x3xf32>) outs(%226 : tensor<1x960x65x65xf32>) -> tensor<1x960x65x65xf32>
flow.return %296 : tensor<1x960x65x65xf32>
}
%246 = flow.dispatch.region -> (tensor<65x65x1x960xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%245 : tensor<1x960x65x65xf32>) outs(%228 : tensor<65x65x1x960xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.cmpf ult, %in, %cst_15 : f32
%298 = arith.select %297, %cst_15, %in : f32
%299 = arith.cmpf ugt, %298, %cst_2 : f32
%300 = arith.select %299, %cst_2, %298 : f32
%301 = arith.divf %300, %cst_11 : f32
%302 = math.round %301 : f32
%303 = arith.addf %302, %cst_15 : f32
%304 = arith.cmpf ult, %303, %cst_17 : f32
%305 = arith.cmpf ugt, %303, %cst_16 : f32
%306 = arith.select %304, %cst_17, %303 : f32
%307 = arith.select %305, %cst_16, %306 : f32
%308 = arith.fptosi %307 : f32 to i8
%309 = arith.extsi %308 : i8 to i32
%310 = arith.sitofp %309 : i32 to f32
%311 = arith.mulf %310, %cst_11 : f32
linalg.yield %311 : f32
} -> tensor<65x65x1x960xf32>
flow.return %296 : tensor<65x65x1x960xf32>
}
%collapsed_234 = tensor.collapse_shape %246 [[0], [1], [2, 3]] : tensor<65x65x1x960xf32> into tensor<65x65x960xf32>
%247 = tensor.empty() : tensor<65x65x320xf32>
%inserted_slice_235 = tensor.insert_slice %collapsed_234 into %231[0, 0, 0] [65, 65, 960] [1, 1, 1] : tensor<65x65x960xf32> into tensor<68x68x960xf32>
%expanded_236 = tensor.expand_shape %inserted_slice_235 [[0, 1], [2], [3]] output_shape [1, 68, 68, 960] : tensor<68x68x960xf32> into tensor<1x68x68x960xf32>
%248 = tensor.empty() : tensor<1x68x68x320xf32>
%249 = linalg.fill ins(%cst_15 : f32) outs(%248 : tensor<1x68x68x320xf32>) -> tensor<1x68x68x320xf32>
%250 = flow.dispatch.region -> (tensor<1x68x68x320xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_236, %cst_105 : tensor<1x68x68x960xf32>, tensor<1x1x960x320xf32>) outs(%249 : tensor<1x68x68x320xf32>) -> tensor<1x68x68x320xf32>
flow.return %296 : tensor<1x68x68x320xf32>
}
%extracted_slice_237 = tensor.extract_slice %250[0, 0, 0, 0] [1, 65, 65, 320] [1, 1, 1, 1] : tensor<1x68x68x320xf32> to tensor<65x65x320xf32>
%251 = tensor.empty() : tensor<65x65x320xi8>
%252 = flow.dispatch.region -> (tensor<65x65x320xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_237, %cst_50 : tensor<65x65x320xf32>, tensor<320xf32>) outs(%251 : tensor<65x65x320xi8>) {
^bb0(%in: f32, %in_256: f32, %out: i8):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.divf %297, %cst_9 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
linalg.yield %305 : i8
} -> tensor<65x65x320xi8>
flow.return %296 : tensor<65x65x320xi8>
}
%253 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%252 : tensor<65x65x320xi8>) outs(%247 : tensor<65x65x320xf32>) {
^bb0(%in: i8, %out: f32):
%296 = arith.extsi %in : i8 to i32
%297 = arith.sitofp %296 : i32 to f32
%298 = arith.mulf %297, %cst_9 : f32
linalg.yield %298 : f32
} -> tensor<65x65x320xf32>
%254 = tensor.empty() : tensor<320x65x65xf32>
%255 = flow.dispatch.region -> (tensor<320x65x65xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%253 : tensor<65x65x320xf32>) outs(%254 : tensor<320x65x65xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<320x65x65xf32>
flow.return %296 : tensor<320x65x65xf32>
}
%expanded_238 = tensor.expand_shape %255 [[0, 1], [2], [3]] output_shape [1, 320, 65, 65] : tensor<320x65x65xf32> into tensor<1x320x65x65xf32>
%256 = tensor.empty() : tensor<1x320x1x1xf32>
%257 = linalg.fill ins(%cst_15 : f32) outs(%256 : tensor<1x320x1x1xf32>) -> tensor<1x320x1x1xf32>
%258 = tensor.empty() : tensor<65x65xf32>
%259 = flow.dispatch.region -> (tensor<1x320x1x1xf32>) {
%296 = linalg.pooling_nchw_sum {dilations = dense<1> : vector<2xi64>, strides = dense<65> : vector<2xi64>} ins(%expanded_238, %258 : tensor<1x320x65x65xf32>, tensor<65x65xf32>) outs(%257 : tensor<1x320x1x1xf32>) -> tensor<1x320x1x1xf32>
flow.return %296 : tensor<1x320x1x1xf32>
}
%collapsed_239 = tensor.collapse_shape %259 [[0, 1, 2, 3]] : tensor<1x320x1x1xf32> into tensor<320xf32>
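    // Image-pooling branch (presumably the ASPP global-pooling path): the 320-channel feature map is transposed to NCHW and reduced with a 65x65 linalg.pooling_nchw_sum (stride 65), giving a per-channel 1x320x1x1 sum; the division that turns this sum into a mean appears to happen later, in the arith.divf by %cst_18 inside %271.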
%260 = tensor.empty() : tensor<65x65x256xf32>
%261 = tensor.empty() : tensor<68x68x320xf32>
%262 = linalg.fill ins(%cst_15 : f32) outs(%261 : tensor<68x68x320xf32>) -> tensor<68x68x320xf32>
%inserted_slice_240 = tensor.insert_slice %253 into %262[0, 0, 0] [65, 65, 320] [1, 1, 1] : tensor<65x65x320xf32> into tensor<68x68x320xf32>
%expanded_241 = tensor.expand_shape %inserted_slice_240 [[0, 1], [2], [3]] output_shape [1, 68, 68, 320] : tensor<68x68x320xf32> into tensor<1x68x68x320xf32>
%263 = tensor.empty() : tensor<1x68x68x256xf32>
%264 = linalg.fill ins(%cst_15 : f32) outs(%263 : tensor<1x68x68x256xf32>) -> tensor<1x68x68x256xf32>
%265 = flow.dispatch.region -> (tensor<1x68x68x256xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_241, %cst_106 : tensor<1x68x68x320xf32>, tensor<1x1x320x256xf32>) outs(%264 : tensor<1x68x68x256xf32>) -> tensor<1x68x68x256xf32>
flow.return %296 : tensor<1x68x68x256xf32>
}
%extracted_slice_242 = tensor.extract_slice %265[0, 0, 0, 0] [1, 65, 65, 256] [1, 1, 1, 1] : tensor<1x68x68x256xf32> to tensor<65x65x256xf32>
%expanded_243 = tensor.expand_shape %collapsed_239 [[0, 1, 2]] output_shape [1, 1, 320] : tensor<320xf32> into tensor<1x1x320xf32>
%266 = tensor.empty() : tensor<1x1x320xf32>
%267 = tensor.empty() : tensor<256x65x65xf32>
%268 = flow.dispatch.region -> (tensor<256x65x65xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_242, %cst_25 : tensor<65x65x256xf32>, tensor<256xf32>) outs(%267 : tensor<256x65x65xf32>) {
^bb0(%in: f32, %in_256: f32, %out: f32):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.cmpf ugt, %297, %cst_15 : f32
%299 = arith.select %298, %297, %cst_15 : f32
%300 = arith.divf %299, %cst_12 : f32
%301 = math.round %300 : f32
%302 = arith.addf %301, %cst_15 : f32
%303 = arith.cmpf ult, %302, %cst_17 : f32
%304 = arith.cmpf ugt, %302, %cst_16 : f32
%305 = arith.select %303, %cst_17, %302 : f32
%306 = arith.select %304, %cst_16, %305 : f32
%307 = arith.fptosi %306 : f32 to i8
%308 = arith.extsi %307 : i8 to i32
%309 = arith.sitofp %308 : i32 to f32
%310 = arith.mulf %309, %cst_12 : f32
linalg.yield %310 : f32
} -> tensor<256x65x65xf32>
flow.return %296 : tensor<256x65x65xf32>
}
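    // The 320 -> 256 pointwise conv result gets its bias (%cst_25) plus a ReLU (select against %cst_15, presumably 0.0), is requantized at scale %cst_12, and is transposed to CHW layout, all in the single generic producing %268.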
%269 = tensor.empty() : tensor<4x4x320xf32>
%270 = linalg.fill ins(%cst_15 : f32) outs(%269 : tensor<4x4x320xf32>) -> tensor<4x4x320xf32>
%271 = flow.dispatch.region -> (tensor<4x4x320xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_243 : tensor<1x1x320xf32>) outs(%266 : tensor<1x1x320xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.divf %in, %cst_18 : f32
%298 = arith.mulf %297, %cst_1 : f32
%299 = arith.divf %298, %cst_9 : f32
%300 = math.round %299 : f32
%301 = arith.addf %300, %cst_15 : f32
%302 = arith.cmpf ult, %301, %cst_17 : f32
%303 = arith.cmpf ugt, %301, %cst_16 : f32
%304 = arith.select %302, %cst_17, %301 : f32
%305 = arith.select %303, %cst_16, %304 : f32
%306 = arith.fptosi %305 : f32 to i8
%307 = arith.extsi %306 : i8 to i32
%308 = arith.sitofp %307 : i32 to f32
%309 = arith.mulf %308, %cst_9 : f32
linalg.yield %309 : f32
} -> tensor<1x1x320xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %270[0, 0, 0] [1, 1, 320] [1, 1, 1] : tensor<1x1x320xf32> into tensor<4x4x320xf32>
flow.return %inserted_slice_256 : tensor<4x4x320xf32>
}
%expanded_244 = tensor.expand_shape %271 [[0, 1], [2], [3]] output_shape [1, 4, 4, 320] : tensor<4x4x320xf32> into tensor<1x4x4x320xf32>
%272 = tensor.empty() : tensor<1x4x4x256xf32>
%273 = linalg.fill ins(%cst_15 : f32) outs(%272 : tensor<1x4x4x256xf32>) -> tensor<1x4x4x256xf32>
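  // 1x1 conv projecting the image-pooling branch from 320 to 256 channels on the padded 4x4 tile; only the [0, 0] spatial position carries real data.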
%274 = flow.dispatch.region -> (tensor<1x4x4x256xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_244, %cst_107 : tensor<1x4x4x320xf32>, tensor<1x1x320x256xf32>) outs(%273 : tensor<1x4x4x256xf32>) -> tensor<1x4x4x256xf32>
flow.return %296 : tensor<1x4x4x256xf32>
}
%extracted_slice_245 = tensor.extract_slice %274[0, 0, 0, 0] [1, 1, 1, 256] [1, 1, 1, 1] : tensor<1x4x4x256xf32> to tensor<256xf32>
%expanded_246 = tensor.expand_shape %extracted_slice_245 [[0, 1, 2, 3]] output_shape [1, 256, 1, 1] : tensor<256xf32> into tensor<1x256x1x1xf32>
%275 = tensor.empty() : tensor<1x256x1x1xf32>
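  // Bias add, clamp against %cst_15, and fake-quant round trip on the pooled 1x256x1x1 feature.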
%276 = flow.dispatch.region -> (tensor<1x256x1x1xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_246, %cst_0 : tensor<1x256x1x1xf32>, tensor<1x256x1x1xf32>) outs(%275 : tensor<1x256x1x1xf32>) {
^bb0(%in: f32, %in_256: f32, %out: f32):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.cmpf ugt, %297, %cst_15 : f32
%299 = arith.select %298, %297, %cst_15 : f32
%300 = arith.divf %299, %cst_12 : f32
%301 = math.round %300 : f32
%302 = arith.addf %301, %cst_15 : f32
%303 = arith.cmpf ult, %302, %cst_17 : f32
%304 = arith.cmpf ugt, %302, %cst_16 : f32
%305 = arith.select %303, %cst_17, %302 : f32
%306 = arith.select %304, %cst_16, %305 : f32
%307 = arith.fptosi %306 : f32 to i8
%308 = arith.extsi %307 : i8 to i32
%309 = arith.sitofp %308 : i32 to f32
%310 = arith.mulf %309, %cst_12 : f32
linalg.yield %310 : f32
} -> tensor<1x256x1x1xf32>
flow.return %296 : tensor<1x256x1x1xf32>
}
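  // Bilinear-resize style gather that upsamples the 1x256x1x1 pooled feature to 256x65x65; with a source extent of 1 all four taps hit the same element, so this is effectively a broadcast, followed by another fake-quant round trip.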
%277 = flow.dispatch.region -> (tensor<256x65x65xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%267 : tensor<256x65x65xf32>) {
^bb0(%out: f32):
%297 = linalg.index 0 : index
%298 = linalg.index 1 : index
%299 = linalg.index 2 : index
%300 = arith.index_cast %298 : index to i64
%301 = arith.sitofp %300 : i64 to f32
%302 = arith.addf %301, %cst_13 : f32
%303 = arith.divf %302, %cst_7 : f32
%304 = arith.subf %303, %cst_13 : f32
%305 = arith.maximumf %304, %cst_15 : f32
%306 = arith.minimumf %305, %cst_5 : f32
%307 = arith.minimumf %305, %cst_15 : f32
%308 = math.floor %306 : f32
%309 = arith.addf %306, %cst_14 : f32
%310 = math.floor %309 : f32
%311 = arith.fptosi %308 : f32 to i64
%312 = arith.index_cast %311 : i64 to index
%313 = arith.fptosi %310 : f32 to i64
%314 = arith.index_cast %313 : i64 to index
%315 = arith.index_cast %299 : index to i64
%316 = arith.sitofp %315 : i64 to f32
%317 = arith.addf %316, %cst_13 : f32
%318 = arith.divf %317, %cst_7 : f32
%319 = arith.subf %318, %cst_13 : f32
%320 = arith.maximumf %319, %cst_15 : f32
%321 = arith.minimumf %320, %cst_5 : f32
%322 = arith.minimumf %320, %cst_15 : f32
%323 = math.floor %321 : f32
%324 = arith.addf %321, %cst_14 : f32
%325 = math.floor %324 : f32
%326 = arith.fptosi %323 : f32 to i64
%327 = arith.index_cast %326 : i64 to index
%328 = arith.fptosi %325 : f32 to i64
%329 = arith.index_cast %328 : i64 to index
%extracted = tensor.extract %276[%c0, %297, %312, %327] : tensor<1x256x1x1xf32>
%extracted_256 = tensor.extract %276[%c0, %297, %312, %329] : tensor<1x256x1x1xf32>
%extracted_257 = tensor.extract %276[%c0, %297, %314, %327] : tensor<1x256x1x1xf32>
%extracted_258 = tensor.extract %276[%c0, %297, %314, %329] : tensor<1x256x1x1xf32>
%330 = arith.subf %310, %307 : f32
%331 = arith.subf %307, %308 : f32
%332 = arith.subf %325, %322 : f32
%333 = arith.subf %322, %323 : f32
%334 = arith.mulf %332, %extracted : f32
%335 = arith.mulf %333, %extracted_256 : f32
%336 = arith.addf %334, %335 : f32
%337 = arith.mulf %330, %336 : f32
%338 = arith.mulf %332, %extracted_257 : f32
%339 = arith.mulf %333, %extracted_258 : f32
%340 = arith.addf %338, %339 : f32
%341 = arith.mulf %331, %340 : f32
%342 = arith.addf %337, %341 : f32
%343 = arith.divf %342, %cst_12 : f32
%344 = math.round %343 : f32
%345 = arith.addf %344, %cst_15 : f32
%346 = arith.cmpf ult, %345, %cst_17 : f32
%347 = arith.cmpf ugt, %345, %cst_16 : f32
%348 = arith.select %346, %cst_17, %345 : f32
%349 = arith.select %347, %cst_16, %348 : f32
%350 = arith.fptosi %349 : f32 to i8
%351 = arith.extsi %350 : i8 to i32
%352 = arith.sitofp %351 : i32 to f32
%353 = arith.mulf %352, %cst_12 : f32
linalg.yield %353 : f32
} -> tensor<256x65x65xf32>
flow.return %296 : tensor<256x65x65xf32>
}
%278 = tensor.empty() : tensor<1x512x65x65xf32>
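  // Channel-concatenate the upsampled image-pooling branch and the 1x1 conv branch into 1x512x65x65 via two insert_slices; the next region transposes to 65x65x1x512 with a fake-quant round trip.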
%inserted_slice_247 = tensor.insert_slice %277 into %278[0, 0, 0, 0] [1, 256, 65, 65] [1, 1, 1, 1] : tensor<256x65x65xf32> into tensor<1x512x65x65xf32>
%inserted_slice_248 = tensor.insert_slice %268 into %inserted_slice_247[0, 256, 0, 0] [1, 256, 65, 65] [1, 1, 1, 1] : tensor<256x65x65xf32> into tensor<1x512x65x65xf32>
%279 = tensor.empty() : tensor<65x65x1x512xf32>
%280 = flow.dispatch.region -> (tensor<65x65x1x512xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_248 : tensor<1x512x65x65xf32>) outs(%279 : tensor<65x65x1x512xf32>) {
^bb0(%in: f32, %out: f32):
%297 = arith.divf %in, %cst_12 : f32
%298 = math.round %297 : f32
%299 = arith.addf %298, %cst_15 : f32
%300 = arith.cmpf ult, %299, %cst_17 : f32
%301 = arith.cmpf ugt, %299, %cst_16 : f32
%302 = arith.select %300, %cst_17, %299 : f32
%303 = arith.select %301, %cst_16, %302 : f32
%304 = arith.fptosi %303 : f32 to i8
%305 = arith.extsi %304 : i8 to i32
%306 = arith.sitofp %305 : i32 to f32
%307 = arith.mulf %306, %cst_12 : f32
linalg.yield %307 : f32
} -> tensor<65x65x1x512xf32>
flow.return %296 : tensor<65x65x1x512xf32>
}
%collapsed_249 = tensor.collapse_shape %280 [[0], [1], [2, 3]] : tensor<65x65x1x512xf32> into tensor<65x65x512xf32>
%281 = tensor.empty() : tensor<68x68x512xf32>
%282 = linalg.fill ins(%cst_15 : f32) outs(%281 : tensor<68x68x512xf32>) -> tensor<68x68x512xf32>
%inserted_slice_250 = tensor.insert_slice %collapsed_249 into %282[0, 0, 0] [65, 65, 512] [1, 1, 1] : tensor<65x65x512xf32> into tensor<68x68x512xf32>
%expanded_251 = tensor.expand_shape %inserted_slice_250 [[0, 1], [2], [3]] output_shape [1, 68, 68, 512] : tensor<68x68x512xf32> into tensor<1x68x68x512xf32>
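  // 1x1 projection conv over the concatenated (and zero-padded) features: 512 -> 256 channels; the valid 65x65 region is sliced out below.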
%283 = flow.dispatch.region -> (tensor<1x68x68x256xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_251, %cst_108 : tensor<1x68x68x512xf32>, tensor<1x1x512x256xf32>) outs(%264 : tensor<1x68x68x256xf32>) -> tensor<1x68x68x256xf32>
flow.return %296 : tensor<1x68x68x256xf32>
}
%extracted_slice_252 = tensor.extract_slice %283[0, 0, 0, 0] [1, 65, 65, 256] [1, 1, 1, 1] : tensor<1x68x68x256xf32> to tensor<65x65x256xf32>
%284 = tensor.empty() : tensor<68x68x256xf32>
%285 = linalg.fill ins(%cst_15 : f32) outs(%284 : tensor<68x68x256xf32>) -> tensor<68x68x256xf32>
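  // Bias add, clamp against %cst_15, and fake-quant round trip on the projected features, then re-pad to 68x68x256 for the classifier conv.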
%286 = flow.dispatch.region -> (tensor<68x68x256xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_252, %cst_72 : tensor<65x65x256xf32>, tensor<256xf32>) outs(%260 : tensor<65x65x256xf32>) {
^bb0(%in: f32, %in_257: f32, %out: f32):
%297 = arith.addf %in, %in_257 : f32
%298 = arith.cmpf ugt, %297, %cst_15 : f32
%299 = arith.select %298, %297, %cst_15 : f32
%300 = arith.divf %299, %cst_11 : f32
%301 = math.round %300 : f32
%302 = arith.addf %301, %cst_15 : f32
%303 = arith.cmpf ult, %302, %cst_17 : f32
%304 = arith.cmpf ugt, %302, %cst_16 : f32
%305 = arith.select %303, %cst_17, %302 : f32
%306 = arith.select %304, %cst_16, %305 : f32
%307 = arith.fptosi %306 : f32 to i8
%308 = arith.extsi %307 : i8 to i32
%309 = arith.sitofp %308 : i32 to f32
%310 = arith.mulf %309, %cst_11 : f32
linalg.yield %310 : f32
} -> tensor<65x65x256xf32>
%inserted_slice_256 = tensor.insert_slice %296 into %285[0, 0, 0] [65, 65, 256] [1, 1, 1] : tensor<65x65x256xf32> into tensor<68x68x256xf32>
flow.return %inserted_slice_256 : tensor<68x68x256xf32>
}
%expanded_253 = tensor.expand_shape %286 [[0, 1], [2], [3]] output_shape [1, 68, 68, 256] : tensor<68x68x256xf32> into tensor<1x68x68x256xf32>
%287 = tensor.empty() : tensor<1x68x68x24xf32>
%288 = linalg.fill ins(%cst_15 : f32) outs(%287 : tensor<1x68x68x24xf32>) -> tensor<1x68x68x24xf32>
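  // Final 1x1 classifier conv; the output channel count appears padded from 21 classes to 24, and the valid 21 channels are sliced back out below.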
%289 = flow.dispatch.region -> (tensor<1x68x68x24xf32>) {
%296 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%expanded_253, %cst_3 : tensor<1x68x68x256xf32>, tensor<1x1x256x24xf32>) outs(%288 : tensor<1x68x68x24xf32>) -> tensor<1x68x68x24xf32>
flow.return %296 : tensor<1x68x68x24xf32>
}
%extracted_slice_254 = tensor.extract_slice %289[0, 0, 0, 0] [1, 65, 65, 21] [1, 1, 1, 1] : tensor<1x68x68x24xf32> to tensor<65x65x21xf32>
%expanded_255 = tensor.expand_shape %extracted_slice_254 [[0], [1], [2, 3]] output_shape [65, 65, 1, 21] : tensor<65x65x21xf32> into tensor<65x65x1x21xf32>
%290 = tensor.empty() : tensor<1x21x65x65xi8>
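  // Add the classifier bias, quantize to i8, and transpose to NCHW, giving 1x21x65x65 quantized logits.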
%291 = flow.dispatch.region -> (tensor<1x21x65x65xi8>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d3, d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_255, %cst : tensor<65x65x1x21xf32>, tensor<1x21xf32>) outs(%290 : tensor<1x21x65x65xi8>) {
^bb0(%in: f32, %in_256: f32, %out: i8):
%297 = arith.addf %in, %in_256 : f32
%298 = arith.divf %297, %cst_10 : f32
%299 = math.round %298 : f32
%300 = arith.addf %299, %cst_15 : f32
%301 = arith.cmpf ult, %300, %cst_17 : f32
%302 = arith.cmpf ugt, %300, %cst_16 : f32
%303 = arith.select %301, %cst_17, %300 : f32
%304 = arith.select %302, %cst_16, %303 : f32
%305 = arith.fptosi %304 : f32 to i8
linalg.yield %305 : i8
} -> tensor<1x21x65x65xi8>
flow.return %296 : tensor<1x21x65x65xi8>
}
%292 = tensor.empty() : tensor<1x513x513x21xf32>
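  // Bilinear upsampling of the 65x65 logits to the 513x513 input resolution: each of the four taps is dequantized from i8, interpolated, and the result passes through two back-to-back fake-quant round trips.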
%293 = flow.dispatch.region -> (tensor<1x513x513x21xf32>) {
%296 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%292 : tensor<1x513x513x21xf32>) {
^bb0(%out: f32):
%297 = linalg.index 0 : index
%298 = linalg.index 1 : index
%299 = linalg.index 2 : index
%300 = linalg.index 3 : index
%301 = affine.apply affine_map<(d0, d1) -> (d0 + d1 * 513)>(%298, %297)
%302 = arith.index_cast %301 : index to i64
%303 = arith.sitofp %302 : i64 to f32
%304 = arith.addf %303, %cst_13 : f32
%305 = arith.divf %304, %cst_6 : f32
%306 = arith.subf %305, %cst_13 : f32
%307 = arith.maximumf %306, %cst_15 : f32
%308 = arith.minimumf %307, %cst_4 : f32
%309 = math.floor %308 : f32
%310 = arith.addf %308, %cst_14 : f32
%311 = math.floor %310 : f32
%312 = arith.fptosi %309 : f32 to i64
%313 = arith.index_cast %312 : i64 to index
%314 = arith.fptosi %311 : f32 to i64
%315 = arith.index_cast %314 : i64 to index
%316 = arith.index_cast %299 : index to i64
%317 = arith.sitofp %316 : i64 to f32
%318 = arith.addf %317, %cst_13 : f32
%319 = arith.divf %318, %cst_6 : f32
%320 = arith.subf %319, %cst_13 : f32
%321 = arith.maximumf %320, %cst_15 : f32
%322 = arith.minimumf %321, %cst_4 : f32
%323 = math.floor %322 : f32
%324 = arith.addf %322, %cst_14 : f32
%325 = math.floor %324 : f32
%326 = arith.fptosi %323 : f32 to i64
%327 = arith.index_cast %326 : i64 to index
%328 = arith.fptosi %325 : f32 to i64
%329 = arith.index_cast %328 : i64 to index
%extracted = tensor.extract %291[%c0, %300, %313, %327] : tensor<1x21x65x65xi8>
%330 = arith.extsi %extracted : i8 to i32
%331 = arith.sitofp %330 : i32 to f32
%332 = arith.mulf %331, %cst_10 : f32
%extracted_256 = tensor.extract %291[%c0, %300, %313, %329] : tensor<1x21x65x65xi8>
%333 = arith.extsi %extracted_256 : i8 to i32
%334 = arith.sitofp %333 : i32 to f32
%335 = arith.mulf %334, %cst_10 : f32
%extracted_257 = tensor.extract %291[%c0, %300, %315, %327] : tensor<1x21x65x65xi8>
%336 = arith.extsi %extracted_257 : i8 to i32
%337 = arith.sitofp %336 : i32 to f32
%338 = arith.mulf %337, %cst_10 : f32
%extracted_258 = tensor.extract %291[%c0, %300, %315, %329] : tensor<1x21x65x65xi8>
%339 = arith.extsi %extracted_258 : i8 to i32
%340 = arith.sitofp %339 : i32 to f32
%341 = arith.mulf %340, %cst_10 : f32
%342 = arith.subf %311, %308 : f32
%343 = arith.subf %308, %309 : f32
%344 = arith.subf %325, %322 : f32
%345 = arith.subf %322, %323 : f32
%346 = arith.mulf %344, %332 : f32
%347 = arith.mulf %345, %335 : f32
%348 = arith.addf %346, %347 : f32
%349 = arith.mulf %342, %348 : f32
%350 = arith.mulf %344, %338 : f32
%351 = arith.mulf %345, %341 : f32
%352 = arith.addf %350, %351 : f32
%353 = arith.mulf %343, %352 : f32
%354 = arith.addf %349, %353 : f32
%355 = arith.divf %354, %cst_10 : f32
%356 = math.round %355 : f32
%357 = arith.addf %356, %cst_15 : f32
%358 = arith.cmpf ult, %357, %cst_17 : f32
%359 = arith.cmpf ugt, %357, %cst_16 : f32
%360 = arith.select %358, %cst_17, %357 : f32
%361 = arith.select %359, %cst_16, %360 : f32
%362 = arith.fptosi %361 : f32 to i8
%363 = arith.extsi %362 : i8 to i32
%364 = arith.sitofp %363 : i32 to f32
%365 = arith.mulf %364, %cst_10 : f32
%366 = arith.divf %365, %cst_10 : f32
%367 = math.round %366 : f32
%368 = arith.addf %367, %cst_15 : f32
%369 = arith.cmpf ult, %368, %cst_17 : f32
%370 = arith.cmpf ugt, %368, %cst_16 : f32
%371 = arith.select %369, %cst_17, %368 : f32
%372 = arith.select %370, %cst_16, %371 : f32
%373 = arith.fptosi %372 : f32 to i8
%374 = arith.extsi %373 : i8 to i32
%375 = arith.sitofp %374 : i32 to f32
%376 = arith.mulf %375, %cst_10 : f32
linalg.yield %376 : f32
} -> tensor<1x513x513x21xf32>
flow.return %296 : tensor<1x513x513x21xf32>
}
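  // Barrier on the output fence and export of the 1x513x513x21 logits as the function result.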
%294 = hal.tensor.barrier join(%293 : tensor<1x513x513x21xf32>) => %arg2 : !hal.fence
%295 = hal.tensor.export %294 : tensor<1x513x513x21xf32> -> !hal.buffer_view
util.return %295 : !hal.buffer_view
}