module {
  // Dispatch: for each row of a dynamically shaped ?x? f32 input, sum the row
  // and divide the sum by the product of the first two push constants.
  func.func @forward_dispatch_0_generic_D() {
    %cst = arith.constant dense<0.000000e+00> : vector<4xf32>
    %c4 = arith.constant 4 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // Push constants: [0] and [1] carry the divisor factors, [2] and [3] the
    // dynamic tensor sizes.
    %0 = hal.interface.constant.load[0] : i32
    %1 = hal.interface.constant.load[1] : i32
    %2 = hal.interface.constant.load[2] : i32
    %3 = hal.interface.constant.load[3] : i32
    %4 = arith.index_castui %0 : i32 to index
    %5 = arith.index_castui %1 : i32 to index
    %6 = arith.index_castui %2 : i32 to index
    %7 = arith.index_castui %3 : i32 to index
    %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
    %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%6}
    // Divisor: product of the first two push constants, converted to f32.
    %10 = arith.index_cast %4 : index to i64
    %11 = arith.index_cast %5 : index to i64
    %12 = arith.muli %11, %10 : i64
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %13 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %14 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
    %15 = arith.sitofp %12 : i64 to f32
    %16 = vector.broadcast %15 : f32 to vector<4xf32>
    // Each workgroup strides over the rows in tiles of 32.
    scf.for %arg0 = %13 to %6 step %14 {
      %17 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 32)>(%arg0)[%6]
      %18 = flow.dispatch.tensor.load %9, offsets = [%arg0], sizes = [%17], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%6} -> tensor<?xf32>
      %19 = flow.dispatch.tensor.load %8, offsets = [%arg0, 0], sizes = [%17, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
      %20 = affine.apply affine_map<()[s0] -> ((s0 floordiv 4) * 4)>()[%17]
      // Main loop: process rows four at a time, accumulating four row sums in
      // a vector<4xf32>, one column per inner iteration.
      %21 = scf.for %arg1 = %c0 to %20 step %c4 iter_args(%arg2 = %18) -> (tensor<?xf32>) {
        %23 = scf.for %arg3 = %c0 to %7 step %c1 iter_args(%arg4 = %cst) -> (vector<4xf32>) {
          %26 = vector.transfer_read %19[%arg1, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x1xf32>
          %27 = vector.shape_cast %26 : vector<4x1xf32> to vector<4xf32>
          %28 = arith.addf %arg4, %27 : vector<4xf32>
          scf.yield %28 : vector<4xf32>
        }
        %24 = arith.divf %23, %16 : vector<4xf32>
        %25 = vector.transfer_write %24, %arg2[%arg1] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32>
        scf.yield %25 : tensor<?xf32>
      }
      // Epilogue: the remaining rows (fewer than four) go through unvectorized
      // linalg ops on dynamically sized slices.
      %22 = scf.for %arg1 = %20 to %17 step %c4 iter_args(%arg2 = %21) -> (tensor<?xf32>) {
        %23 = affine.apply affine_map<(d0, d1) -> (-d0 + d1)>(%arg1, %17)
        %extracted_slice = tensor.extract_slice %19[%arg1, 0] [%23, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [%23] [1] : tensor<?xf32> to tensor<?xf32>
        %extracted_slice_2 = tensor.extract_slice %extracted_slice_1[0] [%23] [1] : tensor<?xf32> to tensor<?xf32>
        %24 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_2 : tensor<?xf32>) -> tensor<?xf32>
        %25 = scf.for %arg3 = %c0 to %7 step %c1 iter_args(%arg4 = %24) -> (tensor<?xf32>) {
          %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg3] [%23, 1] [1, 1] : tensor<?x?xf32> to tensor<?x1xf32>
          %extracted_slice_5 = tensor.extract_slice %arg4[0] [%23] [1] : tensor<?xf32> to tensor<?xf32>
          %27 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice_4 : tensor<?x1xf32>) outs(%extracted_slice_5 : tensor<?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {
          ^bb0(%in: f32, %out: f32):
            %28 = arith.addf %in, %out : f32
            linalg.yield %28 : f32
          } -> tensor<?xf32>
          %inserted_slice_6 = tensor.insert_slice %27 into %arg4[0] [%23] [1] : tensor<?xf32> into tensor<?xf32>
          scf.yield %inserted_slice_6 : tensor<?xf32>
        }
        %dim = tensor.dim %24, %c0 : tensor<?xf32>
        %extracted_slice_3 = tensor.extract_slice %25[0] [%dim] [1] : tensor<?xf32> to tensor<?xf32>
        %26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} outs(%extracted_slice_3 : tensor<?xf32>) {
        ^bb0(%out: f32):
          %27 = arith.divf %out, %15 : f32
          linalg.yield %27 : f32
        } -> tensor<?xf32>
        %inserted_slice = tensor.insert_slice %26 into %arg2[%arg1] [%23] [1] : tensor<?xf32> into tensor<?xf32>
        scf.yield %inserted_slice : tensor<?xf32>
      }
      flow.dispatch.tensor.store %22, %9, offsets = [%arg0], sizes = [%17], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%6}
    }
    return
  }
}