Max191/pad_winograd.mlir

## pad_winograd.mlir
// Dispatch that zero-pads a 16x64x64 f32 tensor to 16x66x66, runs
// iree_linalg_ext.winograd.input_transform on it, truncates the f32 result to
// bf16 and stores it to the write-only output binding.
// NOTE(review): the symbol name ends in "xf32" although the stored tensor is
// bf16; the name is kept verbatim.
module {
  func.func @main$async_dispatch_1_winograd_input_transform_11x11x16x8x8xf32()
      attributes {translation_info = #iree_codegen.translation_info<CPULinalgExtTileAndVectorize>} {
    // Pad fill value and the offset operands used by the two binding subspans.
    %zero = arith.constant 0.000000e+00 : f32
    %src_offset = arith.constant 0 : index
    %dst_offset = arith.constant 262144 : index

    // Binding 0 is the read-only source; binding 1 is the write-only result.
    %src_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%src_offset) flags(ReadOnly)
        : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>>
    %dst_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%dst_offset)
        : !flow.dispatch.tensor<writeonly:tensor<11x11x16x8x8xbf16>>

    // Load the whole 16x64x64 source tensor.
    %image = flow.dispatch.tensor.load %src_binding, offsets = [0, 0, 0], sizes = [16, 64, 64], strides = [1, 1, 1]
        : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>> -> tensor<16x64x64xf32>

    // Destination tensors: bf16 for the final truncation, f32 for the transform.
    %bf16_init = tensor.empty() : tensor<11x11x16x8x8xbf16>
    %f32_init = tensor.empty() : tensor<11x11x16x8x8xf32>

    // Pad dims 1 and 2 by one element on each side with zeros (64 -> 66).
    %padded_image = tensor.pad %image low[0, 1, 1] high[0, 1, 1] {
    ^bb0(%i: index, %j: index, %k: index):
      tensor.yield %zero : f32
    } {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[2, 0, 0], [1, 8, 8], [0, 0, 0], [0, 0, 0]]>}
        : tensor<16x64x64xf32> to tensor<16x66x66xf32>

    // Winograd input transform of the padded tensor into 11x11x16x8x8 f32.
    %transformed = iree_linalg_ext.winograd.input_transform
        {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 2, 0, 0], [1, 1, 1, 8, 8]]>}
        output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) input_tile_dimensions([3, 4])
        ins(%padded_image : tensor<16x66x66xf32>)
        outs(%f32_init : tensor<11x11x16x8x8xf32>) -> tensor<11x11x16x8x8xf32>

    // Elementwise f32 -> bf16 truncation (identity maps, all-parallel iterators).
    %truncated = linalg.generic {
        indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>,
                         affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>],
        iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
        ins(%transformed : tensor<11x11x16x8x8xf32>)
        outs(%bf16_init : tensor<11x11x16x8x8xbf16>)
        attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 2, 0, 0], [1, 1, 1, 8, 8], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]>} {
    ^bb0(%wide: f32, %unused_out: bf16):
      %narrow = arith.truncf %wide : f32 to bf16
      linalg.yield %narrow : bf16
    } -> tensor<11x11x16x8x8xbf16>

    // Write the full bf16 result to the output binding.
    flow.dispatch.tensor.store %truncated, %dst_binding, offsets = [0, 0, 0, 0, 0], sizes = [11, 11, 16, 8, 8], strides = [1, 1, 1, 1, 1]
        : tensor<11x11x16x8x8xbf16> -> !flow.dispatch.tensor<writeonly:tensor<11x11x16x8x8xbf16>>
    return
  }
}
	// Second copy of the dispatch module: tensor.pad -> winograd.input_transform
	// -> f32-to-bf16 truncation -> store. Re-indented for readability; every
	// line keeps the leading tab this copy had in the original text.
	module {
	  func.func @main$async_dispatch_1_winograd_input_transform_11x11x16x8x8xf32()
	      attributes {translation_info = #iree_codegen.translation_info<CPULinalgExtTileAndVectorize>} {
	    // Scalar constants: the pad fill value and the two subspan offset operands.
	    %fill = arith.constant 0.000000e+00 : f32
	    %off_in = arith.constant 0 : index
	    %off_out = arith.constant 262144 : index

	    // Input (binding 0, ReadOnly) and output (binding 1) dispatch tensors.
	    %input = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%off_in) flags(ReadOnly)
	        : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>>
	    %output = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%off_out)
	        : !flow.dispatch.tensor<writeonly:tensor<11x11x16x8x8xbf16>>

	    // Full-tensor load of the 16x64x64 f32 input.
	    %loaded = flow.dispatch.tensor.load %input, offsets = [0, 0, 0], sizes = [16, 64, 64], strides = [1, 1, 1]
	        : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>> -> tensor<16x64x64xf32>

	    // Uninitialized destinations for the truncation (bf16) and the transform (f32).
	    %dest_bf16 = tensor.empty() : tensor<11x11x16x8x8xbf16>
	    %dest_f32 = tensor.empty() : tensor<11x11x16x8x8xf32>

	    // Zero-pad one element low and high on dims 1 and 2: 16x64x64 -> 16x66x66.
	    %pad = tensor.pad %loaded low[0, 1, 1] high[0, 1, 1] {
	    ^bb0(%d0_idx: index, %d1_idx: index, %d2_idx: index):
	      tensor.yield %fill : f32
	    } {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[2, 0, 0], [1, 8, 8], [0, 0, 0], [0, 0, 0]]>}
	        : tensor<16x64x64xf32> to tensor<16x66x66xf32>

	    // Winograd input transform (output_tile_size 6, kernel_size 3) into f32.
	    %winograd = iree_linalg_ext.winograd.input_transform
	        {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 2, 0, 0], [1, 1, 1, 8, 8]]>}
	        output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) input_tile_dimensions([3, 4])
	        ins(%pad : tensor<16x66x66xf32>)
	        outs(%dest_f32 : tensor<11x11x16x8x8xf32>) -> tensor<11x11x16x8x8xf32>

	    // Per-element arith.truncf from f32 to bf16 over the 5-D result.
	    %as_bf16 = linalg.generic {
	        indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>,
	                         affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>],
	        iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
	        ins(%winograd : tensor<11x11x16x8x8xf32>)
	        outs(%dest_bf16 : tensor<11x11x16x8x8xbf16>)
	        attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 2, 0, 0], [1, 1, 1, 8, 8], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]>} {
	    ^bb0(%src_elem: f32, %dst_elem: bf16):
	      %cast = arith.truncf %src_elem : f32 to bf16
	      linalg.yield %cast : bf16
	    } -> tensor<11x11x16x8x8xbf16>

	    // Store the complete 11x11x16x8x8 bf16 tensor to the output binding.
	    flow.dispatch.tensor.store %as_bf16, %output, offsets = [0, 0, 0, 0, 0], sizes = [11, 11, 16, 8, 8], strides = [1, 1, 1, 1, 1]
	        : tensor<11x11x16x8x8xbf16> -> !flow.dispatch.tensor<writeonly:tensor<11x11x16x8x8xbf16>>
	    return
	  }
	}