@antiagainst
Last active March 3, 2024 18:41
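// IR dumps from the IREE codegen pipeline for a single GPU dispatch containing a
// 2-D NCHW/FCHW convolution (input 2x8x33x33xf16, filter 8x8x3x3xf16, stride 2,
// output 2x8x16x16xf16) followed by a per-channel bias add. Each
// `IR Dump After <pass>` marker shows the module after the named pass ran; dumps
// like these are most likely produced with MLIR's standard
// --mlir-print-ir-after-all flag (an assumption; the invocation is not shown).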
// -----// IR Dump After CSE (cse) //----- //
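// Starting point after workgroup-level tiling: each workgroup computes a
// 1x2x16x16 tile of the 2x8x16x16 output. The workgroup ids pick the (N, OC)
// slice via the floordiv/mod affine maps plus a 16x16 spatial tile; the output
// tile is loaded first as the destination operand (destination-passing style),
// zero-filled, convolved against a 1x8x33x33 input window and a 2x8x3x3 filter
// slice, and the trailing linalg.generic adds the 2-element bias slice.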
module {
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8xf16>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%4 = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%workgroup_id_z]
%5 = affine.apply affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>()[%workgroup_id_z]
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>> -> tensor<1x2x16x16xf16>
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%11 = flow.dispatch.tensor.load %0, offsets = [%4, 0, %9, %10], sizes = [1, 8, 33, 33], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>> -> tensor<1x8x33x33xf16>
%12 = flow.dispatch.tensor.load %1, offsets = [%5, 0, 0, 0], sizes = [2, 8, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>> -> tensor<2x8x3x3xf16>
%13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} ins(%cst : f16) outs(%8 : tensor<1x2x16x16xf16>) -> tensor<1x2x16x16xf16>
%14 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>, strides = dense<2> : vector<2xi64>} ins(%11, %12 : tensor<1x8x33x33xf16>, tensor<2x8x3x3xf16>) outs(%13 : tensor<1x2x16x16xf16>) -> tensor<1x2x16x16xf16>
%15 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xf16>> -> tensor<2xf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2xf16>) outs(%14 : tensor<1x2x16x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} {
^bb0(%in: f16, %out: f16):
%17 = arith.addf %out, %in : f16
linalg.yield %17 : f16
} -> tensor<1x2x16x16xf16>
flow.dispatch.tensor.store %16, %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : tensor<1x2x16x16xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
return
}
}
// -----// IR Dump After GPUTensorTile (iree-codegen-gpu-tensor-tile) //----- //
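// GPUTensorTile distributes the workgroup tile across threads: each scf.forall
// maps an 8x4 thread grid (#gpu.thread<y>/<x>) and hands every thread a
// 1x2x2x4 output slice. The reduction dimensions become scf.for loops (input
// channel in steps of 4, filter height and width in steps of 1), and each conv
// is additionally unrolled along the output-height tile into two 1x2x1x4
// convolutions. The duplicate constants, affine.apply ops, and slice chains
// left behind by tiling are folded by the canonicalizer below.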
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8xf16>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%4 = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%workgroup_id_z]
%5 = affine.apply affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>()[%workgroup_id_z]
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>> -> tensor<1x2x16x16xf16>
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%11 = flow.dispatch.tensor.load %0, offsets = [%4, 0, %9, %10], sizes = [1, 8, 33, 33], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>> -> tensor<1x8x33x33xf16>
%12 = flow.dispatch.tensor.load %1, offsets = [%5, 0, 0, 0], sizes = [2, 8, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>> -> tensor<2x8x3x3xf16>
%13 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %8) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = linalg.fill {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} ins(%cst : f16) outs(%extracted_slice : tensor<1x2x2x4xf16>) -> tensor<1x2x2x4xf16>
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %20, %21] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%14 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %13) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg1)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%20 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %11[0, 0, %17, %18] [1, 8, 5, 9] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x8x5x9xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, 0, %19, %20] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%21 = scf.for %arg3 = %c0 to %c8 step %c4 iter_args(%arg4 = %extracted_slice_0) -> (tensor<1x2x2x4xf16>) {
%24 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x2x2x4xf16>) {
%25 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x2x2x4xf16>) {
%extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %arg3, %arg5, %arg7] [1, 4, 3, 7] [1, 1, 1, 1] : tensor<1x8x5x9xf16> to tensor<1x4x3x7xf16>
%extracted_slice_2 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1x1xf16>
%26 = linalg.conv_2d_nchw_fchw {__internal_linalg_transform__ = "workgroup_k_tiled", dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>, strides = dense<2> : vector<2xi64>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x4x3x7xf16>, tensor<2x4x1x1xf16>) outs(%arg8 : tensor<1x2x2x4xf16>) -> tensor<1x2x2x4xf16>
%c0_3 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1_4 = arith.constant 1 : index
%c2_5 = arith.constant 2 : index
%27 = affine.apply affine_map<(d0) -> (d0 * 2)>(%c0_3)
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, %arg3, %arg5, %arg7] [1, 4, 3, 7] [1, 1, 1, 1] : tensor<1x8x5x9xf16> to tensor<1x4x3x7xf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_6[0, 0, %27, 0] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x4x3x7xf16> to tensor<1x4x1x7xf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_1[0, 0, %27, 0] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x4x3x7xf16> to tensor<1x4x1x7xf16>
%extracted_slice_9 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1x1xf16>
%extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x4x1x1xf16> to tensor<2x4x1x1xf16>
%extracted_slice_11 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x4x1x1xf16> to tensor<2x4x1x1xf16>
%extracted_slice_12 = tensor.extract_slice %arg8[0, 0, %c0_3, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x1x4xf16>
%28 = linalg.conv_2d_nchw_fchw {__internal_linalg_transform__ = "workgroup_k_tiled", dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>, strides = dense<2> : vector<2xi64>} ins(%extracted_slice_8, %extracted_slice_11 : tensor<1x4x1x7xf16>, tensor<2x4x1x1xf16>) outs(%extracted_slice_12 : tensor<1x2x1x4xf16>) -> tensor<1x2x1x4xf16>
%inserted_slice = tensor.insert_slice %28 into %arg8[0, 0, %c0_3, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x1x4xf16> into tensor<1x2x2x4xf16>
%c1_13 = arith.constant 1 : index
%29 = arith.muli %c1_4, %c1_13 : index
%30 = arith.addi %c0_3, %29 : index
%31 = affine.apply affine_map<(d0) -> (d0 * 2)>(%30)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %arg3, %arg5, %arg7] [1, 4, 3, 7] [1, 1, 1, 1] : tensor<1x8x5x9xf16> to tensor<1x4x3x7xf16>
%extracted_slice_15 = tensor.extract_slice %extracted_slice_14[0, 0, %31, 0] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x4x3x7xf16> to tensor<1x4x1x7xf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_1[0, 0, %31, 0] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x4x3x7xf16> to tensor<1x4x1x7xf16>
%extracted_slice_17 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1x1xf16>
%extracted_slice_18 = tensor.extract_slice %extracted_slice_17[0, 0, 0, 0] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x4x1x1xf16> to tensor<2x4x1x1xf16>
%extracted_slice_19 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x4x1x1xf16> to tensor<2x4x1x1xf16>
%extracted_slice_20 = tensor.extract_slice %inserted_slice[0, 0, %30, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x1x4xf16>
%32 = linalg.conv_2d_nchw_fchw {__internal_linalg_transform__ = "workgroup_k_tiled", dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>, strides = dense<2> : vector<2xi64>} ins(%extracted_slice_16, %extracted_slice_19 : tensor<1x4x1x7xf16>, tensor<2x4x1x1xf16>) outs(%extracted_slice_20 : tensor<1x2x1x4xf16>) -> tensor<1x2x1x4xf16>
%inserted_slice_21 = tensor.insert_slice %32 into %inserted_slice[0, 0, %30, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x1x4xf16> into tensor<1x2x2x4xf16>
scf.yield %inserted_slice_21 : tensor<1x2x2x4xf16>
}
scf.yield %25 : tensor<1x2x2x4xf16>
}
scf.yield %24 : tensor<1x2x2x4xf16>
}
%22 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%23 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg2[0, 0, %22, %23] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%15 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xf16>> -> tensor<2xf16>
%16 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %14) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2xf16>) outs(%extracted_slice : tensor<1x2x2x4xf16>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} {
^bb0(%in: f16, %out: f16):
%22 = arith.addf %out, %in : f16
linalg.yield %22 : f16
} -> tensor<1x2x2x4xf16>
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %20, %21] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %16, %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : tensor<1x2x16x16xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
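// Canonicalization folds the redundant constants, affine.apply ops, and
// duplicated extract_slice chains from the previous dump; the two per-row
// convolutions now read input rows 0 and 2 of the 1x4x3x7 slice directly
// (stride 2 in H) and update output rows 0 and 1 of the thread tile.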
module {
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8xf16>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%4 = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%workgroup_id_z]
%5 = affine.apply affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>()[%workgroup_id_z]
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>> -> tensor<1x2x16x16xf16>
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%11 = flow.dispatch.tensor.load %0, offsets = [%4, 0, %9, %10], sizes = [1, 8, 33, 33], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>> -> tensor<1x8x33x33xf16>
%12 = flow.dispatch.tensor.load %1, offsets = [%5, 0, 0, 0], sizes = [2, 8, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>> -> tensor<2x8x3x3xf16>
%13 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %8) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = linalg.fill {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} ins(%cst : f16) outs(%extracted_slice : tensor<1x2x2x4xf16>) -> tensor<1x2x2x4xf16>
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %20, %21] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%14 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %13) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg1)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%20 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %11[0, 0, %17, %18] [1, 8, 5, 9] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x8x5x9xf16>
%extracted_slice_0 = tensor.extract_slice %arg2[0, 0, %19, %20] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%21 = scf.for %arg3 = %c0 to %c8 step %c4 iter_args(%arg4 = %extracted_slice_0) -> (tensor<1x2x2x4xf16>) {
%24 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x2x2x4xf16>) {
%25 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x2x2x4xf16>) {
%extracted_slice_1 = tensor.extract_slice %extracted_slice[0, %arg3, %arg5, %arg7] [1, 4, 3, 7] [1, 1, 1, 1] : tensor<1x8x5x9xf16> to tensor<1x4x3x7xf16>
%extracted_slice_2 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1x1xf16>
%extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0, 0, 0] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x4x3x7xf16> to tensor<1x4x1x7xf16>
%extracted_slice_4 = tensor.extract_slice %arg8[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x1x4xf16>
%26 = linalg.conv_2d_nchw_fchw {__internal_linalg_transform__ = "workgroup_k_tiled", dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>, strides = dense<2> : vector<2xi64>} ins(%extracted_slice_3, %extracted_slice_2 : tensor<1x4x1x7xf16>, tensor<2x4x1x1xf16>) outs(%extracted_slice_4 : tensor<1x2x1x4xf16>) -> tensor<1x2x1x4xf16>
%inserted_slice = tensor.insert_slice %26 into %arg8[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x1x4xf16> into tensor<1x2x2x4xf16>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_1[0, 0, 2, 0] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x4x3x7xf16> to tensor<1x4x1x7xf16>
%extracted_slice_6 = tensor.extract_slice %inserted_slice[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x1x4xf16>
%27 = linalg.conv_2d_nchw_fchw {__internal_linalg_transform__ = "workgroup_k_tiled", dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>, strides = dense<2> : vector<2xi64>} ins(%extracted_slice_5, %extracted_slice_2 : tensor<1x4x1x7xf16>, tensor<2x4x1x1xf16>) outs(%extracted_slice_6 : tensor<1x2x1x4xf16>) -> tensor<1x2x1x4xf16>
%inserted_slice_7 = tensor.insert_slice %27 into %inserted_slice[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x1x4xf16> into tensor<1x2x2x4xf16>
scf.yield %inserted_slice_7 : tensor<1x2x2x4xf16>
}
scf.yield %25 : tensor<1x2x2x4xf16>
}
scf.yield %24 : tensor<1x2x2x4xf16>
}
%22 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%23 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg2[0, 0, %22, %23] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%15 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xf16>> -> tensor<2xf16>
%16 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %14) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2xf16>) outs(%extracted_slice : tensor<1x2x2x4xf16>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} {
^bb0(%in: f16, %out: f16):
%22 = arith.addf %out, %in : f16
linalg.yield %22 : f16
} -> tensor<1x2x2x4xf16>
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %20, %21] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %16, %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : tensor<1x2x16x16xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
return
}
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOps (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
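// With the filter height tiled to 1, each 2-D convolution degenerates to a
// 1-D convolution along W: the rank-4 slices are collapsed to rank 3 and
// linalg.conv_2d_nchw_fchw is rewritten as linalg.conv_1d_ncw_fcw over
// 1x4x7 input, 2x4x1 filter, and 1x2x4 output slices.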
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8xf16>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%4 = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%workgroup_id_z]
%5 = affine.apply affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>()[%workgroup_id_z]
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>> -> tensor<1x2x16x16xf16>
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%11 = flow.dispatch.tensor.load %0, offsets = [%4, 0, %9, %10], sizes = [1, 8, 33, 33], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>> -> tensor<1x8x33x33xf16>
%12 = flow.dispatch.tensor.load %1, offsets = [%5, 0, 0, 0], sizes = [2, 8, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>> -> tensor<2x8x3x3xf16>
%13 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %8) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = linalg.fill {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} ins(%cst : f16) outs(%extracted_slice : tensor<1x2x2x4xf16>) -> tensor<1x2x2x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%14 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %13) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = scf.for %arg3 = %c0 to %c8 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<1x2x2x4xf16>) {
%20 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x2x2x4xf16>) {
%21 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x2x2x4xf16>) {
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%arg0)[%arg5]
%23 = affine.apply affine_map<(d0)[s0] -> (d0 * 8 + s0)>(%arg1)[%arg7]
%extracted_slice_0 = tensor.extract_slice %11[0, %arg3, %22, %23] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x4x7xf16>
%extracted_slice_1 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1xf16>
%extracted_slice_2 = tensor.extract_slice %arg8[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x4xf16>
%24 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x4x7xf16>, tensor<2x4x1xf16>) outs(%extracted_slice_2 : tensor<1x2x4xf16>) -> tensor<1x2x4xf16>
%inserted_slice = tensor.insert_slice %24 into %arg8[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x4xf16> into tensor<1x2x2x4xf16>
%25 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0 + 2)>(%arg0)[%arg5]
%26 = affine.apply affine_map<(d0)[s0] -> (d0 * 8 + s0)>(%arg1)[%arg7]
%extracted_slice_3 = tensor.extract_slice %11[0, %arg3, %25, %26] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x4x7xf16>
%extracted_slice_4 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1xf16>
%extracted_slice_5 = tensor.extract_slice %inserted_slice[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x4xf16>
%27 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x4x7xf16>, tensor<2x4x1xf16>) outs(%extracted_slice_5 : tensor<1x2x4xf16>) -> tensor<1x2x4xf16>
%inserted_slice_6 = tensor.insert_slice %27 into %inserted_slice[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x4xf16> into tensor<1x2x2x4xf16>
scf.yield %inserted_slice_6 : tensor<1x2x2x4xf16>
}
scf.yield %21 : tensor<1x2x2x4xf16>
}
scf.yield %20 : tensor<1x2x2x4xf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%15 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xf16>> -> tensor<2xf16>
%16 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %14) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2xf16>) outs(%extracted_slice : tensor<1x2x2x4xf16>) attrs = {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 16, 16, 4, 1, 1], [0, 0, 1, 0]]>} {
^bb0(%in: f16, %out: f16):
%20 = arith.addf %out, %in : f16
linalg.yield %20 : f16
} -> tensor<1x2x2x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %16, %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : tensor<1x2x16x16xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
return
}
// -----// IR Dump After GenericVectorization (iree-codegen-generic-vectorization) //----- //
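// Vectorization replaces the linalg ops with vector transfers and contractions:
// the fill becomes a transfer_write of a dense<0> vector, and each 1-D conv is
// unrolled over its four strided output positions (W offsets 0, 2, 4, 6) into
// vector.contract ops of shape 1x1x4 (input) x 4x2 (filter) -> 1x1x2
// (accumulator), with transposes placing the contracted channel dimension
// innermost. The bias add lowers to a broadcast, a transpose, and a vector
// arith.addf.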
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x2x4xf16>
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8xf16>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%4 = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%workgroup_id_z]
%5 = affine.apply affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>()[%workgroup_id_z]
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>> -> tensor<1x2x16x16xf16>
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%11 = flow.dispatch.tensor.load %0, offsets = [%4, 0, %9, %10], sizes = [1, 8, 33, 33], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>> -> tensor<1x8x33x33xf16>
%12 = flow.dispatch.tensor.load %1, offsets = [%5, 0, 0, 0], sizes = [2, 8, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>> -> tensor<2x8x3x3xf16>
%13 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %8) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = vector.transfer_write %cst, %extracted_slice[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf16>, tensor<1x2x2x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%14 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %13) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = scf.for %arg3 = %c0 to %c8 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<1x2x2x4xf16>) {
%20 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x2x2x4xf16>) {
%21 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x2x2x4xf16>) {
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%arg0)[%arg5]
%23 = affine.apply affine_map<(d0)[s0] -> (d0 * 8 + s0)>(%arg1)[%arg7]
%extracted_slice_1 = tensor.extract_slice %11[0, %arg3, %22, %23] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x4x7xf16>
%extracted_slice_2 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1xf16>
%extracted_slice_3 = tensor.extract_slice %arg8[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x4xf16>
%24 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x4x7xf16>, vector<1x4x7xf16>
%25 = vector.transfer_read %extracted_slice_2[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<2x4x1xf16>, vector<2x4x1xf16>
%26 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x2x4xf16>, vector<1x2x4xf16>
%27 = vector.transpose %24, [0, 2, 1] : vector<1x4x7xf16> to vector<1x7x4xf16>
%28 = vector.transpose %25, [2, 1, 0] : vector<2x4x1xf16> to vector<1x4x2xf16>
%29 = vector.transpose %26, [0, 2, 1] : vector<1x2x4xf16> to vector<1x4x2xf16>
%30 = vector.extract_strided_slice %27 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%31 = vector.extract_strided_slice %27 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%32 = vector.extract_strided_slice %27 {offsets = [0, 4, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%33 = vector.extract_strided_slice %27 {offsets = [0, 6, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%34 = vector.extract %28[0] : vector<4x2xf16> from vector<1x4x2xf16>
%35 = vector.extract_strided_slice %29 {offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%36 = vector.extract_strided_slice %29 {offsets = [0, 1, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%37 = vector.extract_strided_slice %29 {offsets = [0, 2, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%38 = vector.extract_strided_slice %29 {offsets = [0, 3, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%39 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %30, %34, %35 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %31, %34, %36 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %32, %34, %37 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %33, %34, %38 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%43 = vector.insert_strided_slice %39, %29 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%44 = vector.insert_strided_slice %40, %43 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%45 = vector.insert_strided_slice %41, %44 {offsets = [0, 2, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%46 = vector.insert_strided_slice %42, %45 {offsets = [0, 3, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%47 = vector.transpose %46, [0, 2, 1] : vector<1x4x2xf16> to vector<1x2x4xf16>
%48 = vector.transfer_write %47, %extracted_slice_3[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf16>, tensor<1x2x4xf16>
%inserted_slice = tensor.insert_slice %48 into %arg8[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x4xf16> into tensor<1x2x2x4xf16>
%49 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0 + 2)>(%arg0)[%arg5]
%50 = affine.apply affine_map<(d0)[s0] -> (d0 * 8 + s0)>(%arg1)[%arg7]
%extracted_slice_4 = tensor.extract_slice %11[0, %arg3, %49, %50] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x4x7xf16>
%extracted_slice_5 = tensor.extract_slice %12[0, %arg3, %arg5, %arg7] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1xf16>
%extracted_slice_6 = tensor.extract_slice %inserted_slice[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> to tensor<1x2x4xf16>
%51 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x4x7xf16>, vector<1x4x7xf16>
%52 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<2x4x1xf16>, vector<2x4x1xf16>
%53 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x2x4xf16>, vector<1x2x4xf16>
%54 = vector.transpose %51, [0, 2, 1] : vector<1x4x7xf16> to vector<1x7x4xf16>
%55 = vector.transpose %52, [2, 1, 0] : vector<2x4x1xf16> to vector<1x4x2xf16>
%56 = vector.transpose %53, [0, 2, 1] : vector<1x2x4xf16> to vector<1x4x2xf16>
%57 = vector.extract_strided_slice %54 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%58 = vector.extract_strided_slice %54 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%59 = vector.extract_strided_slice %54 {offsets = [0, 4, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%60 = vector.extract_strided_slice %54 {offsets = [0, 6, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%61 = vector.extract %55[0] : vector<4x2xf16> from vector<1x4x2xf16>
%62 = vector.extract_strided_slice %56 {offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%63 = vector.extract_strided_slice %56 {offsets = [0, 1, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%64 = vector.extract_strided_slice %56 {offsets = [0, 2, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%65 = vector.extract_strided_slice %56 {offsets = [0, 3, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %61, %62 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%67 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %61, %63 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %61, %64 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%69 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %65 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%70 = vector.insert_strided_slice %66, %56 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%71 = vector.insert_strided_slice %67, %70 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%72 = vector.insert_strided_slice %68, %71 {offsets = [0, 2, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%73 = vector.insert_strided_slice %69, %72 {offsets = [0, 3, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%74 = vector.transpose %73, [0, 2, 1] : vector<1x4x2xf16> to vector<1x2x4xf16>
%75 = vector.transfer_write %74, %extracted_slice_6[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf16>, tensor<1x2x4xf16>
%inserted_slice_7 = tensor.insert_slice %75 into %inserted_slice[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x4xf16> into tensor<1x2x2x4xf16>
scf.yield %inserted_slice_7 : tensor<1x2x2x4xf16>
}
scf.yield %21 : tensor<1x2x2x4xf16>
}
scf.yield %20 : tensor<1x2x2x4xf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%15 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xf16>> -> tensor<2xf16>
%16 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %14) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = vector.transfer_read %15[%c0], %cst_0 {in_bounds = [true]} : tensor<2xf16>, vector<2xf16>
%20 = vector.broadcast %19 : vector<2xf16> to vector<1x2x4x2xf16>
%21 = vector.transpose %20, [0, 3, 1, 2] : vector<1x2x4x2xf16> to vector<1x2x2x4xf16>
%22 = vector.transfer_read %extracted_slice[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x2x2x4xf16>, vector<1x2x2x4xf16>
%23 = arith.addf %22, %21 : vector<1x2x2x4xf16>
%24 = vector.transfer_write %23, %extracted_slice[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf16>, tensor<1x2x2x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %16, %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : tensor<1x2x16x16xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
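// There are no tensor.empty ops here for this pass to convert; the visible
// difference is that, by this point in the pipeline, the loop-invariant vector
// transfers have been hoisted out of the reduction loops, so the two
// accumulator vectors are carried as scf.for iter_args and written back to the
// output tile only once per thread.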
module {
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x2x4xf16>
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8xf16>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%4 = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%workgroup_id_z]
%5 = affine.apply affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>()[%workgroup_id_z]
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%8 = flow.dispatch.tensor.load %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>> -> tensor<1x2x16x16xf16>
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%11 = flow.dispatch.tensor.load %0, offsets = [%4, 0, %9, %10], sizes = [1, 8, 33, 33], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>> -> tensor<1x8x33x33xf16>
%12 = flow.dispatch.tensor.load %1, offsets = [%5, 0, 0, 0], sizes = [2, 8, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>> -> tensor<2x8x3x3xf16>
%13 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %8) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = vector.transfer_write %cst, %extracted_slice[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf16>, tensor<1x2x2x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%14 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %13) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%extracted_slice_1 = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x4xf16>
%19 = affine.apply affine_map<(d0) -> (d0 * 2 + 1)>(%arg0)
%extracted_slice_2 = tensor.extract_slice %arg2[0, 0, %19, %18] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x4xf16>
%20 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x2x4xf16>, vector<1x2x4xf16>
%21 = vector.transfer_read %extracted_slice_2[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x2x4xf16>, vector<1x2x4xf16>
%22:2 = scf.for %arg3 = %c0 to %c8 step %c4 iter_args(%arg4 = %20, %arg5 = %21) -> (vector<1x2x4xf16>, vector<1x2x4xf16>) {
%25:2 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg5, %arg8 = %arg4) -> (vector<1x2x4xf16>, vector<1x2x4xf16>) {
%26 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%arg0)[%arg6]
%27 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0 + 2)>(%arg0)[%arg6]
%28:2 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8, %arg11 = %arg7) -> (vector<1x2x4xf16>, vector<1x2x4xf16>) {
%29 = affine.apply affine_map<(d0)[s0] -> (d0 * 8 + s0)>(%arg1)[%arg9]
%extracted_slice_4 = tensor.extract_slice %11[0, %arg3, %26, %29] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x4x7xf16>
%extracted_slice_5 = tensor.extract_slice %12[0, %arg3, %arg6, %arg9] [2, 4, 1, 1] [1, 1, 1, 1] : tensor<2x8x3x3xf16> to tensor<2x4x1xf16>
%30 = vector.transfer_read %extracted_slice_4[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x4x7xf16>, vector<1x4x7xf16>
%31 = vector.transfer_read %extracted_slice_5[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<2x4x1xf16>, vector<2x4x1xf16>
%32 = vector.transpose %30, [0, 2, 1] : vector<1x4x7xf16> to vector<1x7x4xf16>
%33 = vector.transpose %31, [2, 1, 0] : vector<2x4x1xf16> to vector<1x4x2xf16>
%34 = vector.transpose %arg10, [0, 2, 1] : vector<1x2x4xf16> to vector<1x4x2xf16>
%35 = vector.extract_strided_slice %32 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%36 = vector.extract_strided_slice %32 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%37 = vector.extract_strided_slice %32 {offsets = [0, 4, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%38 = vector.extract_strided_slice %32 {offsets = [0, 6, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%39 = vector.extract %33[0] : vector<4x2xf16> from vector<1x4x2xf16>
%40 = vector.extract_strided_slice %34 {offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%41 = vector.extract_strided_slice %34 {offsets = [0, 1, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%42 = vector.extract_strided_slice %34 {offsets = [0, 2, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%43 = vector.extract_strided_slice %34 {offsets = [0, 3, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %35, %39, %40 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %36, %39, %41 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%46 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %37, %39, %42 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %38, %39, %43 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%48 = vector.insert_strided_slice %44, %34 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%49 = vector.insert_strided_slice %45, %48 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%50 = vector.insert_strided_slice %46, %49 {offsets = [0, 2, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%51 = vector.insert_strided_slice %47, %50 {offsets = [0, 3, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%52 = vector.transpose %51, [0, 2, 1] : vector<1x4x2xf16> to vector<1x2x4xf16>
%extracted_slice_6 = tensor.extract_slice %11[0, %arg3, %27, %29] [1, 4, 1, 7] [1, 1, 1, 1] : tensor<1x8x33x33xf16> to tensor<1x4x7xf16>
%53 = vector.transfer_read %extracted_slice_6[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x4x7xf16>, vector<1x4x7xf16>
%54 = vector.transpose %53, [0, 2, 1] : vector<1x4x7xf16> to vector<1x7x4xf16>
%55 = vector.transpose %arg11, [0, 2, 1] : vector<1x2x4xf16> to vector<1x4x2xf16>
%56 = vector.extract_strided_slice %54 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%57 = vector.extract_strided_slice %54 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%58 = vector.extract_strided_slice %54 {offsets = [0, 4, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%59 = vector.extract_strided_slice %54 {offsets = [0, 6, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%60 = vector.extract_strided_slice %55 {offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%61 = vector.extract_strided_slice %55 {offsets = [0, 1, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%62 = vector.extract_strided_slice %55 {offsets = [0, 2, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%63 = vector.extract_strided_slice %55 {offsets = [0, 3, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %39, %60 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %39, %61 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %39, %62 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%67 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %39, %63 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%68 = vector.insert_strided_slice %64, %55 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%69 = vector.insert_strided_slice %65, %68 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%70 = vector.insert_strided_slice %66, %69 {offsets = [0, 2, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%71 = vector.insert_strided_slice %67, %70 {offsets = [0, 3, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%72 = vector.transpose %71, [0, 2, 1] : vector<1x4x2xf16> to vector<1x2x4xf16>
scf.yield %52, %72 : vector<1x2x4xf16>, vector<1x2x4xf16>
}
scf.yield %28#1, %28#0 : vector<1x2x4xf16>, vector<1x2x4xf16>
}
scf.yield %25#1, %25#0 : vector<1x2x4xf16>, vector<1x2x4xf16>
}
%23 = vector.transfer_write %22#1, %extracted_slice_2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf16>, tensor<1x2x4xf16>
%24 = vector.transfer_write %22#0, %extracted_slice_1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf16>, tensor<1x2x4xf16>
%inserted_slice = tensor.insert_slice %23 into %extracted_slice[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x4xf16> into tensor<1x2x2x4xf16>
%inserted_slice_3 = tensor.insert_slice %24 into %inserted_slice[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : tensor<1x2x4xf16> into tensor<1x2x2x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice_3 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
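// note: the scf.forall above computes the convolution itself over an 8x4 thread grid;
// the forall below fuses the bias add, broadcasting the 2-element bias vector across
// each thread's 1x2x2x4 output tile and accumulating in place.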
%15 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xf16>> -> tensor<2xf16>
%16 = scf.forall (%arg0, %arg1) in (8, 4) shared_outs(%arg2 = %14) -> (tensor<1x2x16x16xf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x16x16xf16> to tensor<1x2x2x4xf16>
%19 = vector.transfer_read %15[%c0], %cst_0 {in_bounds = [true]} : tensor<2xf16>, vector<2xf16>
%20 = vector.broadcast %19 : vector<2xf16> to vector<1x2x4x2xf16>
%21 = vector.transpose %20, [0, 3, 1, 2] : vector<1x2x4x2xf16> to vector<1x2x2x4xf16>
%22 = vector.transfer_read %arg2[%c0, %c0, %17, %18], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x2x16x16xf16>, vector<1x2x2x4xf16>
%23 = arith.addf %22, %21 : vector<1x2x2x4xf16>
%24 = vector.transfer_write %23, %extracted_slice[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf16>, tensor<1x2x2x4xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg2[0, 0, %17, %18] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf16> into tensor<1x2x16x16xf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %16, %3, offsets = [%4, %5, %6, %7], sizes = [1, 2, 16, 16], strides = [1, 1, 1, 1] : tensor<1x2x16x16xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
return
}
}
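// note: the dump below shows the same dispatch after bufferization: the tensor-level
// flow.dispatch.tensor.load/store pairs become memref.subview plus memref.copy, the
// scf.forall shared_outs disappear in favor of in-place writes, and per-thread output
// rows are staged through workgroup-memory allocations guarded by gpu.barrier.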
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
module {
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%cst = arith.constant dense<0.000000e+00> : vector<1x2x2x4xf16>
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x8x33x33xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2x8x33x33xf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x8x3x3xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<8x8x3x3xf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<8xf16, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<2x8x16x16xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 64 : memref<2x8x16x16xf16, #hal.descriptor_type<storage_buffer>>
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%4 = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%workgroup_id_z]
%5 = affine.apply affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>()[%workgroup_id_z]
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%subview = memref.subview %3[%4, %5, %6, %7] [1, 2, 16, 16] [1, 1, 1, 1] : memref<2x8x16x16xf16, #hal.descriptor_type<storage_buffer>> to memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%subview_1 = memref.subview %0[%4, 0, %8, %9] [1, 8, 33, 33] [1, 1, 1, 1] : memref<2x8x33x33xf16, #hal.descriptor_type<storage_buffer>> to memref<1x8x33x33xf16, strided<[8712, 1089, 33, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[%5, 0, 0, 0] [2, 8, 3, 3] [1, 1, 1, 1] : memref<8x8x3x3xf16, #hal.descriptor_type<storage_buffer>> to memref<2x8x3x3xf16, strided<[72, 9, 3, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (8, 4) {
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%11 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%subview_5 = memref.subview %subview[0, 0, %10, %11] [1, 2, 2, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
vector.transfer_write %cst, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf16>, memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview[0, 0, %10, %11] [1, 2, 2, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_5, %subview_6 : memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
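// note: the forall above is the bufferized zero-fill of each thread's 1x2x2x4 output
// tile; copying %subview_5 into the identical %subview_6 is a bufferization artifact
// that later cleanup is expected to fold away.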
scf.forall (%arg0, %arg1) in (8, 4) {
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%11 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%subview_5 = memref.subview %subview[0, 0, %10, %11] [1, 2, 2, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview[0, 0, %10, %11] [1, 2, 1, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x2x4xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %subview_6, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xf16, #gpu.address_space<workgroup>>
gpu.barrier
%12 = affine.apply affine_map<(d0) -> (d0 * 2 + 1)>(%arg0)
%subview_7 = memref.subview %subview[0, 0, %12, %11] [1, 2, 1, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc_8 = memref.alloc() : memref<1x2x4xf16, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %subview_7, %alloc_8 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xf16, #gpu.address_space<workgroup>>
gpu.barrier
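// note: each thread stages two 1x2x4 rows of its output tile into workgroup-memory
// allocations under barriers before entering the reduction loops below.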
%13 = vector.transfer_read %alloc[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x2x4xf16, #gpu.address_space<workgroup>>, vector<1x2x4xf16>
%14 = vector.transfer_read %alloc_8[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x2x4xf16, #gpu.address_space<workgroup>>, vector<1x2x4xf16>
%15:2 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %13, %arg4 = %14) -> (vector<1x2x4xf16>, vector<1x2x4xf16>) {
%16:2 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4, %arg7 = %arg3) -> (vector<1x2x4xf16>, vector<1x2x4xf16>) {
%17 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%arg0)[%arg5]
%18 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0 + 2)>(%arg0)[%arg5]
%19:2 = scf.for %arg8 = %c0 to %c3 step %c1 iter_args(%arg9 = %arg7, %arg10 = %arg6) -> (vector<1x2x4xf16>, vector<1x2x4xf16>) {
%20 = affine.apply affine_map<(d0)[s0] -> (d0 * 8 + s0)>(%arg1)[%arg8]
%subview_12 = memref.subview %subview_1[0, %arg2, %17, %20] [1, 4, 1, 7] [1, 1, 1, 1] : memref<1x8x33x33xf16, strided<[8712, 1089, 33, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4x7xf16, strided<[8712, 1089, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_13 = memref.subview %subview_2[0, %arg2, %arg5, %arg8] [2, 4, 1, 1] [1, 1, 1, 1] : memref<2x8x3x3xf16, strided<[72, 9, 3, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<2x4x1xf16, strided<[72, 9, 3], offset: ?>, #hal.descriptor_type<storage_buffer>>
%21 = vector.transfer_read %subview_12[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x4x7xf16, strided<[8712, 1089, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4x7xf16>
%22 = vector.transfer_read %subview_13[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<2x4x1xf16, strided<[72, 9, 3], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2x4x1xf16>
%23 = vector.transpose %21, [0, 2, 1] : vector<1x4x7xf16> to vector<1x7x4xf16>
%24 = vector.transpose %22, [2, 1, 0] : vector<2x4x1xf16> to vector<1x4x2xf16>
%25 = vector.transpose %arg9, [0, 2, 1] : vector<1x2x4xf16> to vector<1x4x2xf16>
%26 = vector.extract_strided_slice %23 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%27 = vector.extract_strided_slice %23 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%28 = vector.extract_strided_slice %23 {offsets = [0, 4, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%29 = vector.extract_strided_slice %23 {offsets = [0, 6, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%30 = vector.extract %24[0] : vector<4x2xf16> from vector<1x4x2xf16>
%31 = vector.extract_strided_slice %25 {offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%32 = vector.extract_strided_slice %25 {offsets = [0, 1, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%33 = vector.extract_strided_slice %25 {offsets = [0, 2, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%34 = vector.extract_strided_slice %25 {offsets = [0, 3, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%35 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %26, %30, %31 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%36 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %30, %32 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%37 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %30, %33 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%38 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %29, %30, %34 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%39 = vector.insert_strided_slice %35, %25 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%40 = vector.insert_strided_slice %36, %39 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%41 = vector.insert_strided_slice %37, %40 {offsets = [0, 2, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%42 = vector.insert_strided_slice %38, %41 {offsets = [0, 3, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%43 = vector.transpose %42, [0, 2, 1] : vector<1x4x2xf16> to vector<1x2x4xf16>
%subview_14 = memref.subview %subview_1[0, %arg2, %18, %20] [1, 4, 1, 7] [1, 1, 1, 1] : memref<1x8x33x33xf16, strided<[8712, 1089, 33, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4x7xf16, strided<[8712, 1089, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%44 = vector.transfer_read %subview_14[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x4x7xf16, strided<[8712, 1089, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4x7xf16>
%45 = vector.transpose %44, [0, 2, 1] : vector<1x4x7xf16> to vector<1x7x4xf16>
%46 = vector.transpose %arg10, [0, 2, 1] : vector<1x2x4xf16> to vector<1x4x2xf16>
%47 = vector.extract_strided_slice %45 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%48 = vector.extract_strided_slice %45 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%49 = vector.extract_strided_slice %45 {offsets = [0, 4, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%50 = vector.extract_strided_slice %45 {offsets = [0, 6, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x7x4xf16> to vector<1x1x4xf16>
%51 = vector.extract_strided_slice %46 {offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%52 = vector.extract_strided_slice %46 {offsets = [0, 1, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%53 = vector.extract_strided_slice %46 {offsets = [0, 2, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%54 = vector.extract_strided_slice %46 {offsets = [0, 3, 0], sizes = [1, 1, 2], strides = [1, 1, 1]} : vector<1x4x2xf16> to vector<1x1x2xf16>
%55 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %30, %51 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %30, %52 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %30, %53 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%58 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %50, %30, %54 : vector<1x1x4xf16>, vector<4x2xf16> into vector<1x1x2xf16>
%59 = vector.insert_strided_slice %55, %46 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%60 = vector.insert_strided_slice %56, %59 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%61 = vector.insert_strided_slice %57, %60 {offsets = [0, 2, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%62 = vector.insert_strided_slice %58, %61 {offsets = [0, 3, 0], strides = [1, 1, 1]} : vector<1x1x2xf16> into vector<1x4x2xf16>
%63 = vector.transpose %62, [0, 2, 1] : vector<1x4x2xf16> to vector<1x2x4xf16>
scf.yield %43, %63 : vector<1x2x4xf16>, vector<1x2x4xf16>
}
scf.yield %19#1, %19#0 : vector<1x2x4xf16>, vector<1x2x4xf16>
}
scf.yield %16#1, %16#0 : vector<1x2x4xf16>, vector<1x2x4xf16>
}
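// note: the loop nest above reduces over input-channel tiles of 4 (up to 8) and the
// 3x3 filter window; its two accumulators are written back into the workgroup staging
// buffers and then copied out to the global output tile under barriers.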
vector.transfer_write %15#1, %alloc_8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf16>, memref<1x2x4xf16, #gpu.address_space<workgroup>>
vector.transfer_write %15#0, %alloc[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf16>, memref<1x2x4xf16, #gpu.address_space<workgroup>>
%subview_9 = memref.subview %subview_5[0, 0, 1, 0] [1, 2, 1, 4] [1, 1, 1, 1] : memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc_8, %subview_9 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x2x4xf16, #gpu.address_space<workgroup>> to memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%subview_10 = memref.subview %subview_5[0, 0, 0, 0] [1, 2, 1, 4] [1, 1, 1, 1] : memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %alloc, %subview_10 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<1x2x4xf16, #gpu.address_space<workgroup>> to memref<1x2x4xf16, strided<[2048, 256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%subview_11 = memref.subview %subview[0, 0, %10, %11] [1, 2, 2, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_5, %subview_11 : memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
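// note: as in the tensor form, the forall below performs the fused bias add, now
// reading the bias through a memref.subview and writing directly into the output
// tile in global memory.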
%subview_3 = memref.subview %2[%5] [2] [1] : memref<8xf16, #hal.descriptor_type<storage_buffer>> to memref<2xf16, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (8, 4) {
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%11 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1)
%subview_5 = memref.subview %subview[0, 0, %10, %11] [1, 2, 2, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%12 = vector.transfer_read %subview_3[%c0], %cst_0 {in_bounds = [true]} : memref<2xf16, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<2xf16>
%13 = vector.broadcast %12 : vector<2xf16> to vector<1x2x4x2xf16>
%14 = vector.transpose %13, [0, 3, 1, 2] : vector<1x2x4x2xf16> to vector<1x2x2x4xf16>
%15 = vector.transfer_read %subview[%c0, %c0, %10, %11], %cst_0 {in_bounds = [true, true, true, true]} : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x2x2x4xf16>
%16 = arith.addf %15, %14 : vector<1x2x2x4xf16>
vector.transfer_write %16, %subview_5[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf16>, memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview[0, 0, %10, %11] [1, 2, 2, 4] [1, 1, 1, 1] : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_5, %subview_6 : memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x2x4xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
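// note: %subview_4 below aliases the same window of %3 as %subview, so the final
// memref.copy is another self-copy left over from bufferization that a later pass
// is expected to elide.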
%subview_4 = memref.subview %3[%4, %5, %6, %7] [1, 2, 16, 16] [1, 1, 1, 1] : memref<2x8x16x16xf16, #hal.descriptor_type<storage_buffer>> to memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview, %subview_4 : memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x2x16x16xf16, strided<[2048, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
}