module {
  // Dispatch: for each row of a dynamically shaped ?x? f32 input, sum the row
  // and divide the sum by the product of the first two push constants.
  func.func @forward_dispatch_0_generic_D() {
    %cst = arith.constant dense<0.000000e+00> : vector<4xf32>
    %c4 = arith.constant 4 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    // Push constants: [0] and [1] carry the divisor factors, [2] and [3] the
    // dynamic tensor sizes.
    %0 = hal.interface.constant.load[0] : i32
    %1 = hal.interface.constant.load[1] : i32
    %2 = hal.interface.constant.load[2] : i32
    %3 = hal.interface.constant.load[3] : i32
    %4 = arith.index_castui %0 : i32 to index
    %5 = arith.index_castui %1 : i32 to index
    %6 = arith.index_castui %2 : i32 to index
    %7 = arith.index_castui %3 : i32 to index
    %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
    %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%6}
    // Divisor: product of the first two push constants, converted to f32.
    %10 = arith.index_cast %4 : index to i64
    %11 = arith.index_cast %5 : index to i64
    %12 = arith.muli %11, %10 : i64
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %13 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %14 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
    %15 = arith.sitofp %12 : i64 to f32
    %16 = vector.broadcast %15 : f32 to vector<4xf32>
    // Each workgroup strides over the rows in tiles of 32.
    scf.for %arg0 = %13 to %6 step %14 {
      %17 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 32)>(%arg0)[%6]
      %18 = flow.dispatch.tensor.load %9, offsets = [%arg0], sizes = [%17], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%6} -> tensor<?xf32>
      %19 = flow.dispatch.tensor.load %8, offsets = [%arg0, 0], sizes = [%17, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
      %20 = affine.apply affine_map<()[s0] -> ((s0 floordiv 4) * 4)>()[%17]
      // Main loop: process rows four at a time, accumulating four row sums in
      // a vector<4xf32>, one column per inner iteration.
      %21 = scf.for %arg1 = %c0 to %20 step %c4 iter_args(%arg2 = %18) -> (tensor<?xf32>) {
        %23 = scf.for %arg3 = %c0 to %7 step %c1 iter_args(%arg4 = %cst) -> (vector<4xf32>) {
          %26 = vector.transfer_read %19[%arg1, %arg3], %cst_0 {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x1xf32>
          %27 = vector.shape_cast %26 : vector<4x1xf32> to vector<4xf32>
          %28 = arith.addf %arg4, %27 : vector<4xf32>
          scf.yield %28 : vector<4xf32>
        }
        %24 = arith.divf %23, %16 : vector<4xf32>
        %25 = vector.transfer_write %24, %arg2[%arg1] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32>
        scf.yield %25 : tensor<?xf32>
      }
      // Epilogue: the remaining rows (fewer than four) go through unvectorized
      // linalg ops on dynamically sized slices.
      %22 = scf.for %arg1 = %20 to %17 step %c4 iter_args(%arg2 = %21) -> (tensor<?xf32>) {
        %23 = affine.apply affine_map<(d0, d1) -> (-d0 + d1)>(%arg1, %17)
        %extracted_slice = tensor.extract_slice %19[%arg1, 0] [%23, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
        %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [%23] [1] : tensor<?xf32> to tensor<?xf32>
        %extracted_slice_2 = tensor.extract_slice %extracted_slice_1[0] [%23] [1] : tensor<?xf32> to tensor<?xf32>
        %24 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_2 : tensor<?xf32>) -> tensor<?xf32>
        %25 = scf.for %arg3 = %c0 to %7 step %c1 iter_args(%arg4 = %24) -> (tensor<?xf32>) {
          %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %arg3] [%23, 1] [1, 1] : tensor<?x?xf32> to tensor<?x1xf32>
          %extracted_slice_5 = tensor.extract_slice %arg4[0] [%23] [1] : tensor<?xf32> to tensor<?xf32>
          %27 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice_4 : tensor<?x1xf32>) outs(%extracted_slice_5 : tensor<?xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {
          ^bb0(%in: f32, %out: f32):
            %28 = arith.addf %in, %out : f32
            linalg.yield %28 : f32
          } -> tensor<?xf32>
          %inserted_slice_6 = tensor.insert_slice %27 into %arg4[0] [%23] [1] : tensor<?xf32> into tensor<?xf32>
          scf.yield %inserted_slice_6 : tensor<?xf32>
        }
        %dim = tensor.dim %24, %c0 : tensor<?xf32>
        %extracted_slice_3 = tensor.extract_slice %25[0] [%dim] [1] : tensor<?xf32> to tensor<?xf32>
        %26 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} outs(%extracted_slice_3 : tensor<?xf32>) {
        ^bb0(%out: f32):
          %27 = arith.divf %out, %15 : f32
          linalg.yield %27 : f32
        } -> tensor<?xf32>
        %inserted_slice = tensor.insert_slice %26 into %arg2[%arg1] [%23] [1] : tensor<?xf32> into tensor<?xf32>
        scf.yield %inserted_slice : tensor<?xf32>
      }
      flow.dispatch.tensor.store %22, %9, offsets = [%arg0], sizes = [%17], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%6}
    }
    return
  }
}