@qedawkins
Created April 25, 2024 20:09
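Debug output from IREE's transform-dialect interpreter. Each "[transform-dialect] Top-level payload:" block dumps the payload IR after another transformation step, tracing a 128x128xf32 linalg.matmul as its reduction dimension is tiled into an scf.for loop, its operand copies are distributed over GPU threads with scf.forall, and those copies are finally fused into the matmul's forall through iree_gpu.shuffle_tensor.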
[transform-dialect] Top-level payload:
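// Initial payload: a single untiled 128x128xf32 linalg.matmul accumulating into the readwrite binding.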
func.func @main() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = linalg.matmul ins(%3, %4 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32>
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
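// The reduction (K) dimension of the matmul is tiled by 4 into an scf.for loop; the accumulator still passes through a redundant full-size extract_slice/insert_slice pair.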
func.func @main() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%c0_0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c4 = arith.constant 4 : index
%6 = scf.for %arg0 = %c0_0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_1 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%extracted_slice_2 = tensor.extract_slice %arg1[0, 0] [128, 128] [1, 1] : tensor<128x128xf32> to tensor<128x128xf32>
%7 = linalg.matmul ins(%extracted_slice, %extracted_slice_1 : tensor<128x4xf32>, tensor<4x128xf32>) outs(%extracted_slice_2 : tensor<128x128xf32>) -> tensor<128x128xf32>
%inserted_slice = tensor.insert_slice %7 into %arg1[0, 0] [128, 128] [1, 1] : tensor<128x128xf32> into tensor<128x128xf32>
scf.yield %inserted_slice : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
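// The redundant accumulator slices are folded away and the loop constants are hoisted to the function entry.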
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<128x4xf32>, tensor<4x128xf32>) outs(%arg1 : tensor<128x128xf32>) -> tensor<128x128xf32>
scf.yield %7 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
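// Unchanged from the previous dump.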
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<128x4xf32>, tensor<4x128xf32>) outs(%arg1 : tensor<128x128xf32>) -> tensor<128x128xf32>
scf.yield %7 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
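// Unchanged from the previous dump.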
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<128x4xf32>, tensor<4x128xf32>) outs(%arg1 : tensor<128x128xf32>) -> tensor<128x128xf32>
scf.yield %7 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
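// Unchanged from the previous dump.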
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<128x4xf32>, tensor<4x128xf32>) outs(%arg1 : tensor<128x128xf32>) -> tensor<128x128xf32>
scf.yield %7 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
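// The 128x4 LHS slice is now staged through a tensor.empty + linalg.copy before feeding the matmul.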
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = tensor.empty() : tensor<128x4xf32>
%8 = linalg.copy ins(%extracted_slice : tensor<128x4xf32>) outs(%7 : tensor<128x4xf32>) -> tensor<128x4xf32>
%9 = linalg.matmul ins(%8, %extracted_slice_0 : tensor<128x4xf32>, tensor<4x128xf32>) outs(%arg1 : tensor<128x128xf32>) -> tensor<128x128xf32>
scf.yield %9 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
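// The 4x128 RHS slice is staged through its own tensor.empty + linalg.copy as well.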
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = tensor.empty() : tensor<128x4xf32>
%8 = linalg.copy ins(%extracted_slice : tensor<128x4xf32>) outs(%7 : tensor<128x4xf32>) -> tensor<128x4xf32>
%9 = tensor.empty() : tensor<4x128xf32>
%10 = linalg.copy ins(%extracted_slice_0 : tensor<4x128xf32>) outs(%9 : tensor<4x128xf32>) -> tensor<4x128xf32>
%11 = linalg.matmul ins(%8, %10 : tensor<128x4xf32>, tensor<4x128xf32>) outs(%arg1 : tensor<128x128xf32>) -> tensor<128x128xf32>
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
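// The matmul is distributed over an 8x8 scf.forall (one 16x16 tile per thread) mapped to #gpu.thread<y>/#gpu.thread<x>.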
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = tensor.empty() : tensor<128x4xf32>
%8 = linalg.copy ins(%extracted_slice : tensor<128x4xf32>) outs(%7 : tensor<128x4xf32>) -> tensor<128x4xf32>
%9 = tensor.empty() : tensor<4x128xf32>
%10 = linalg.copy ins(%extracted_slice_0 : tensor<4x128xf32>) outs(%9 : tensor<4x128xf32>) -> tensor<4x128xf32>
%c8 = arith.constant 8 : index
%c8_1 = arith.constant 8 : index
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%16 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%17 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_2 = tensor.extract_slice %8[%14, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_3 = tensor.extract_slice %10[0, %15] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_4 = tensor.extract_slice %arg4[%16, %17] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%18 = linalg.matmul ins(%extracted_slice_2, %extracted_slice_3 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_4 : tensor<16x16xf32>) -> tensor<16x16xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg4[%19, %20] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
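// The LHS copy is distributed over a (64, 1) scf.forall with a 2x4 tile per thread.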
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = tensor.empty() : tensor<128x4xf32>
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%8 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %7) -> (tensor<128x4xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_2 = tensor.extract_slice %extracted_slice[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%16, %17] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%18 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_3 : tensor<2x4xf32>) -> tensor<2x4xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg4[%19, %20] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%9 = tensor.empty() : tensor<4x128xf32>
%10 = linalg.copy ins(%extracted_slice_0 : tensor<4x128xf32>) outs(%9 : tensor<4x128xf32>) -> tensor<4x128xf32>
%c8 = arith.constant 8 : index
%c8_1 = arith.constant 8 : index
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%16 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%17 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_2 = tensor.extract_slice %8[%14, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_3 = tensor.extract_slice %10[0, %15] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_4 = tensor.extract_slice %arg4[%16, %17] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%18 = linalg.matmul ins(%extracted_slice_2, %extracted_slice_3 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_4 : tensor<16x16xf32>) -> tensor<16x16xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg4[%19, %20] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
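// The RHS copy is distributed over a (2, 32) scf.forall with a 2x4 tile per thread.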
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = tensor.empty() : tensor<128x4xf32>
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%8 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %7) -> (tensor<128x4xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_2 = tensor.extract_slice %extracted_slice[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%16, %17] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%18 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_3 : tensor<2x4xf32>) -> tensor<2x4xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg4[%19, %20] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%9 = tensor.empty() : tensor<4x128xf32>
%c2 = arith.constant 2 : index
%c32 = arith.constant 32 : index
%10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %9) -> (tensor<4x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_2 = tensor.extract_slice %extracted_slice_0[%14, %15] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%16, %17] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%18 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_3 : tensor<2x4xf32>) -> tensor<2x4xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg4[%19, %20] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c8 = arith.constant 8 : index
%c8_1 = arith.constant 8 : index
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%16 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%17 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_2 = tensor.extract_slice %8[%14, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_3 = tensor.extract_slice %10[0, %15] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_4 = tensor.extract_slice %arg4[%16, %17] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%18 = linalg.matmul ins(%extracted_slice_2, %extracted_slice_3 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_4 : tensor<16x16xf32>) -> tensor<16x16xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg4[%19, %20] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
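// Cleanup: the thread-count constants inside the K loop and some duplicate affine.apply index computations are removed.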
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%7 = tensor.empty() : tensor<128x4xf32>
%8 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %7) -> (tensor<128x4xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice[%12, %13] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%16 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%17, %18] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%9 = tensor.empty() : tensor<4x128xf32>
%10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %9) -> (tensor<4x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice_0[%12, %13] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%14, %15] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%16 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%17, %18] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %8[%12, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_2 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%14, %15] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%16 = linalg.matmul ins(%extracted_slice_1, %extracted_slice_2 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%18 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%17, %18] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
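// The tensor.empty ops backing the two copies are hoisted out of the K loop.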
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice[%12, %13] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%16 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%17, %18] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice_0[%12, %13] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%14, %15] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%16 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%17, %18] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%15 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_2 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%14, %15] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%16 = linalg.matmul ins(%extracted_slice_1, %extracted_slice_2 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%18 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%17, %18] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
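// The remaining duplicate affine.apply ops are deduplicated; each forall now computes each thread offset once.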
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice[%12, %13] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%14 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice_0[%12, %13] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%14 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_2 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%14 = linalg.matmul ins(%extracted_slice_1, %extracted_slice_2 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
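// Unchanged from the previous dump.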
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%extracted_slice = tensor.extract_slice %3[0, %arg0] [128, 4] [1, 1] : tensor<128x128xf32> to tensor<128x4xf32>
%extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [4, 128] [1, 1] : tensor<128x128xf32> to tensor<4x128xf32>
%9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice[%12, %13] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%14 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %extracted_slice_0[%12, %13] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%14 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice_1 = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_2 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%14 = linalg.matmul ins(%extracted_slice_1, %extracted_slice_2 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
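// The input extract_slices are folded into the copy foralls, which now slice %3 and %4 directly using the K-loop induction variable.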
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%15 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%arg3)[%arg0]
%extracted_slice = tensor.extract_slice %3[%14, %15] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%16 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%14 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%arg2)[%arg0]
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice = tensor.extract_slice %4[%14, %15] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%16 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%extracted_slice = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%extracted_slice_0 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_1 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %11 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
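// The LHS copy forall is fused into the matmul forall: the 8x8 thread ids are linearized, delinearized to the copy's (64, 1) space, and the copied 2x4 tile is exchanged through iree_gpu.shuffle_tensor to form the 16x4 operand.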
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%9 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf32>) {
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%13 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%arg2)[%arg0]
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
%extracted_slice = tensor.extract_slice %4[%13, %14] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[%11, %12] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%15 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %15 into %arg4[%11, %12] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%10 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%11 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%13 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%14:2 = affine.delinearize_index %13 into (%c64, %c1) : index, index
%15 = affine.apply affine_map<(d0) -> (d0 * 2)>(%14#0)
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%14#1)
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%14#0)
%18 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%14#1)[%arg0]
%extracted_slice = tensor.extract_slice %3[%17, %18] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %6[%15, %16] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%19 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%20 = iree_gpu.shuffle_tensor %19[%15, %16] [2, 4] [1, 1] to %6 [%11, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%extracted_slice_1 = tensor.extract_slice %9[0, %12] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%extracted_slice_2 = tensor.extract_slice %arg4[%11, %12] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%21 = linalg.matmul ins(%20, %extracted_slice_1 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_2 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg4[%11, %12] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %10 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}

[transform-dialect] Top-level payload:
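// The RHS copy is fused the same way, leaving one scf.forall per K iteration with two iree_gpu.shuffle_tensor exchanges feeding the 16x16 matmul.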
func.func @main() {
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%9 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%11 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%17 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg0]
%extracted_slice = tensor.extract_slice %3[%16, %17] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%18 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%19 = iree_gpu.shuffle_tensor %18[%14, %15] [2, 4] [1, 1] to %6 [%10, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%20 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%c2 = arith.constant 2 : index
%c32 = arith.constant 32 : index
%21:2 = affine.delinearize_index %20 into (%c2, %c32) : index, index
%22 = affine.apply affine_map<(d0) -> (d0 * 2)>(%21#0)
%23 = affine.apply affine_map<(d0) -> (d0 * 4)>(%21#1)
%24 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%21#0)[%arg0]
%25 = affine.apply affine_map<(d0) -> (d0 * 4)>(%21#1)
%extracted_slice_1 = tensor.extract_slice %4[%24, %25] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %7[%22, %23] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%26 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%27 = iree_gpu.shuffle_tensor %26[%22, %23] [2, 4] [1, 1] to %7 [0, %11] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%10, %11] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%28 = linalg.matmul ins(%19, %27 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %28 into %arg4[%10, %11] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %9 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%9 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%11 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%17 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg0]
%extracted_slice = tensor.extract_slice %3[%16, %17] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%18 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%19 = iree_gpu.shuffle_tensor %18[%14, %15] [2, 4] [1, 1] to %6 [%10, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%20 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%21:2 = affine.delinearize_index %20 into (%c2, %c32) : index, index
%22 = affine.apply affine_map<(d0) -> (d0 * 2)>(%21#0)
%23 = affine.apply affine_map<(d0) -> (d0 * 4)>(%21#1)
%24 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%21#0)[%arg0]
%25 = affine.apply affine_map<(d0) -> (d0 * 4)>(%21#1)
%extracted_slice_1 = tensor.extract_slice %4[%24, %25] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %7[%22, %23] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%26 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%27 = iree_gpu.shuffle_tensor %26[%22, %23] [2, 4] [1, 1] to %7 [0, %11] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%10, %11] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%28 = linalg.matmul ins(%19, %27 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %28 into %arg4[%10, %11] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %9 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%9 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%11 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%17 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg0]
%extracted_slice = tensor.extract_slice %3[%16, %17] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%18 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%19 = iree_gpu.shuffle_tensor %18[%14, %15] [2, 4] [1, 1] to %6 [%10, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%20 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%21:2 = affine.delinearize_index %20 into (%c2, %c32) : index, index
%22 = affine.apply affine_map<(d0) -> (d0 * 2)>(%21#0)
%23 = affine.apply affine_map<(d0) -> (d0 * 4)>(%21#1)
%24 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%21#0)[%arg0]
%25 = affine.apply affine_map<(d0) -> (d0 * 4)>(%21#1)
%extracted_slice_1 = tensor.extract_slice %4[%24, %25] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %7[%22, %23] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%26 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%27 = iree_gpu.shuffle_tensor %26[%22, %23] [2, 4] [1, 1] to %7 [0, %11] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%10, %11] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%28 = linalg.matmul ins(%19, %27 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %28 into %arg4[%10, %11] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %9 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%9 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%11 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg0]
%extracted_slice = tensor.extract_slice %3[%14, %16] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%17 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%18 = iree_gpu.shuffle_tensor %17[%14, %15] [2, 4] [1, 1] to %6 [%10, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%19:2 = affine.delinearize_index %12 into (%c2, %c32) : index, index
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%19#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%19#1)
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%19#0)[%arg0]
%extracted_slice_1 = tensor.extract_slice %4[%22, %21] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %7[%20, %21] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%20, %21] [2, 4] [1, 1] to %7 [0, %11] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%10, %11] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%25 = linalg.matmul ins(%18, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg4[%10, %11] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %9 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) {
%9 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) {
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
%11 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg3)
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg2, %arg3)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg0]
%extracted_slice = tensor.extract_slice %3[%14, %16] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_0 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%17 = linalg.copy ins(%extracted_slice : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%18 = iree_gpu.shuffle_tensor %17[%14, %15] [2, 4] [1, 1] to %6 [%10, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%19:2 = affine.delinearize_index %12 into (%c2, %c32) : index, index
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%19#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%19#1)
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%19#0)[%arg0]
%extracted_slice_1 = tensor.extract_slice %4[%22, %21] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %7[%20, %21] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%20, %21] [2, 4] [1, 1] to %7 [0, %11] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[%10, %11] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%25 = linalg.matmul ins(%18, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_3 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %25 into %arg4[%10, %11] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %9 : tensor<128x128xf32>
}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%11 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %arg2) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg3]
%extracted_slice_0 = tensor.extract_slice %3[%14, %16] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_1 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%17 = linalg.copy ins(%extracted_slice_0 : tensor<2x4xf32>) outs(%extracted_slice_1 : tensor<2x4xf32>) -> tensor<2x4xf32>
%18 = iree_gpu.shuffle_tensor %17[%14, %15] [2, 4] [1, 1] to %6 [%9, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%19:2 = affine.delinearize_index %12 into (%c2, %c32) : index, index
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%19#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%19#1)
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%19#0)[%arg3]
%extracted_slice_2 = tensor.extract_slice %4[%22, %21] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_3 = tensor.extract_slice %7[%20, %21] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_3 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%20, %21] [2, 4] [1, 1] to %7 [0, %10] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%extracted_slice_4 = tensor.extract_slice %arg4[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%25 = linalg.matmul ins(%18, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_4 : tensor<16x16xf32>) -> tensor<16x16xf32>
%inserted_slice = tensor.insert_slice %25 into %arg4[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
scf.yield %inserted_slice : tensor<128x128xf32>
}
%extracted_slice = tensor.extract_slice %11[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %extracted_slice into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%11 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %arg2) -> (tensor<128x128xf32>) {
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg3]
%extracted_slice_0 = tensor.extract_slice %3[%14, %16] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_1 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%17 = linalg.copy ins(%extracted_slice_0 : tensor<2x4xf32>) outs(%extracted_slice_1 : tensor<2x4xf32>) -> tensor<2x4xf32>
%18 = iree_gpu.shuffle_tensor %17[%14, %15] [2, 4] [1, 1] to %6 [%9, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%19:2 = affine.delinearize_index %12 into (%c2, %c32) : index, index
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%19#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%19#1)
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%19#0)[%arg3]
%extracted_slice_2 = tensor.extract_slice %4[%22, %21] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_3 = tensor.extract_slice %7[%20, %21] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_3 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%20, %21] [2, 4] [1, 1] to %7 [0, %10] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%extracted_slice_4 = tensor.extract_slice %arg4[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%25 = linalg.matmul ins(%18, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%extracted_slice_4 : tensor<16x16xf32>) -> tensor<16x16xf32>
%inserted_slice = tensor.insert_slice %25 into %arg4[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
scf.yield %inserted_slice : tensor<128x128xf32>
}
%extracted_slice = tensor.extract_slice %11[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %extracted_slice into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11:2 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %arg2, %arg5 = %extracted_slice) -> (tensor<128x128xf32>, tensor<16x16xf32>) {
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg3]
%extracted_slice_1 = tensor.extract_slice %3[%14, %16] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_2 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%17 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf32>) outs(%extracted_slice_2 : tensor<2x4xf32>) -> tensor<2x4xf32>
%18 = iree_gpu.shuffle_tensor %17[%14, %15] [2, 4] [1, 1] to %6 [%9, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%19:2 = affine.delinearize_index %12 into (%c2, %c32) : index, index
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%19#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%19#1)
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%19#0)[%arg3]
%extracted_slice_3 = tensor.extract_slice %4[%22, %21] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_4 = tensor.extract_slice %7[%20, %21] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_3 : tensor<2x4xf32>) outs(%extracted_slice_4 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%20, %21] [2, 4] [1, 1] to %7 [0, %10] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%25 = linalg.matmul ins(%18, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%arg5 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.yield %arg4, %25 : tensor<128x128xf32>, tensor<16x16xf32>
}
%inserted_slice = tensor.insert_slice %11#1 into %11#0[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
%extracted_slice_0 = tensor.extract_slice %inserted_slice[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %extracted_slice_0 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<16x16xf32>) {
%12 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%13:2 = affine.delinearize_index %12 into (%c64, %c1) : index, index
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%13#1)[%arg3]
%extracted_slice_0 = tensor.extract_slice %3[%14, %16] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_1 = tensor.extract_slice %6[%14, %15] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%17 = linalg.copy ins(%extracted_slice_0 : tensor<2x4xf32>) outs(%extracted_slice_1 : tensor<2x4xf32>) -> tensor<2x4xf32>
%18 = iree_gpu.shuffle_tensor %17[%14, %15] [2, 4] [1, 1] to %6 [%9, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%19:2 = affine.delinearize_index %12 into (%c2, %c32) : index, index
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%19#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%19#1)
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%19#0)[%arg3]
%extracted_slice_2 = tensor.extract_slice %4[%22, %21] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%extracted_slice_3 = tensor.extract_slice %7[%20, %21] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_3 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%20, %21] [2, 4] [1, 1] to %7 [0, %10] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%25 = linalg.matmul ins(%18, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%arg4 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.yield %25 : tensor<16x16xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %11 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%extracted_slice_0 = tensor.extract_slice %6[%13, %14] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%15:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%15#0)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#1)
%extracted_slice_1 = tensor.extract_slice %7[%16, %17] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%18 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<16x16xf32>) {
%19 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%extracted_slice_2 = tensor.extract_slice %3[%13, %19] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%20 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%21 = iree_gpu.shuffle_tensor %20[%13, %14] [2, 4] [1, 1] to %6 [%9, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%15#0)[%arg3]
%extracted_slice_3 = tensor.extract_slice %4[%22, %17] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_3 : tensor<2x4xf32>) outs(%extracted_slice_1 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%16, %17] [2, 4] [1, 1] to %7 [0, %10] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%25 = linalg.matmul ins(%21, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%arg4 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.yield %25 : tensor<16x16xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%extracted_slice_0 = tensor.extract_slice %6[%13, %14] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%15:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%15#0)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#1)
%extracted_slice_1 = tensor.extract_slice %7[%16, %17] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%18 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<16x16xf32>) {
%19 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%extracted_slice_2 = tensor.extract_slice %3[%13, %19] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%20 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%21 = iree_gpu.shuffle_tensor %20[%13, %14] [2, 4] [1, 1] to %6 [%9, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%15#0)[%arg3]
%extracted_slice_3 = tensor.extract_slice %4[%22, %17] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_3 : tensor<2x4xf32>) outs(%extracted_slice_1 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%16, %17] [2, 4] [1, 1] to %7 [0, %10] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%25 = linalg.matmul ins(%21, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%arg4 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.yield %25 : tensor<16x16xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%extracted_slice_0 = tensor.extract_slice %6[%13, %14] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%15:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%15#0)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#1)
%extracted_slice_1 = tensor.extract_slice %7[%16, %17] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%18 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<16x16xf32>) {
%19 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%extracted_slice_2 = tensor.extract_slice %3[%13, %19] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%20 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%21 = iree_gpu.shuffle_tensor %20[%13, %14] [2, 4] [1, 1] to %6 [%9, 0] [16, 4] [1, 1] : tensor<2x4xf32> -> tensor<128x4xf32> -> tensor<16x4xf32>
%22 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%15#0)[%arg3]
%extracted_slice_3 = tensor.extract_slice %4[%22, %17] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%23 = linalg.copy ins(%extracted_slice_3 : tensor<2x4xf32>) outs(%extracted_slice_1 : tensor<2x4xf32>) -> tensor<2x4xf32>
%24 = iree_gpu.shuffle_tensor %23[%16, %17] [2, 4] [1, 1] to %7 [0, %10] [4, 16] [1, 1] : tensor<2x4xf32> -> tensor<4x128xf32> -> tensor<4x16xf32>
%25 = linalg.matmul ins(%21, %24 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%arg4 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.yield %25 : tensor<16x16xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%extracted_slice_0 = tensor.extract_slice %6[%13, %14] [2, 4] [1, 1] : tensor<128x4xf32> to tensor<2x4xf32>
%15:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%15#0)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%15#1)
%extracted_slice_1 = tensor.extract_slice %7[%16, %17] [2, 4] [1, 1] : tensor<4x128xf32> to tensor<2x4xf32>
%18 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<16x16xf32>) {
%19 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%extracted_slice_2 = tensor.extract_slice %3[%13, %19] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%20 = linalg.copy ins(%extracted_slice_2 : tensor<2x4xf32>) outs(%extracted_slice_0 : tensor<2x4xf32>) -> tensor<2x4xf32>
%inserted_slice = tensor.insert_slice %20 into %6[%13, %14] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<128x4xf32>
gpu.barrier
%extracted_slice_3 = tensor.extract_slice %inserted_slice[%9, 0] [16, 4] [1, 1] : tensor<128x4xf32> to tensor<16x4xf32>
%21 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%15#0)[%arg3]
%extracted_slice_4 = tensor.extract_slice %4[%21, %17] [2, 4] [1, 1] : tensor<128x128xf32> to tensor<2x4xf32>
%22 = linalg.copy ins(%extracted_slice_4 : tensor<2x4xf32>) outs(%extracted_slice_1 : tensor<2x4xf32>) -> tensor<2x4xf32>
%inserted_slice_5 = tensor.insert_slice %22 into %7[%16, %17] [2, 4] [1, 1] : tensor<2x4xf32> into tensor<4x128xf32>
gpu.barrier
%extracted_slice_6 = tensor.extract_slice %inserted_slice_5[0, %10] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32>
%23 = linalg.matmul ins(%extracted_slice_3, %extracted_slice_6 : tensor<16x4xf32>, tensor<4x16xf32>) outs(%arg4 : tensor<16x16xf32>) -> tensor<16x16xf32>
scf.yield %23 : tensor<16x16xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %18 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%14 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<16x16xf32>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%17 = vector.transfer_read %3[%15, %16], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%18 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%19 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%20 = vector.transfer_write %17, %6[%18, %19] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<128x4xf32>
gpu.barrier
%21 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%13#0)[%arg3]
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%23 = vector.transfer_read %4[%21, %22], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%24 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%25 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%26 = vector.transfer_write %23, %7[%24, %25] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<4x128xf32>
gpu.barrier
%27 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%28 = vector.transfer_read %20[%27, %c0], %cst {in_bounds = [true, true]} : tensor<128x4xf32>, vector<16x4xf32>
%29 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%30 = vector.transfer_read %26[%c0, %29], %cst {in_bounds = [true, true]} : tensor<4x128xf32>, vector<4x16xf32>
%31 = vector.transfer_read %arg4[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
%32 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %30, %31 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
%33 = vector.transfer_write %32, %arg4[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32>
scf.yield %33 : tensor<16x16xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}
[transform-dialect] Top-level payload:
func.func @main() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%14 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice) -> (tensor<16x16xf32>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%17 = vector.transfer_read %3[%15, %16], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%18 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%19 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%20 = vector.transfer_write %17, %6[%18, %19] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<128x4xf32>
gpu.barrier
%21 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%13#0)[%arg3]
%22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%23 = vector.transfer_read %4[%21, %22], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%24 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%25 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%26 = vector.transfer_write %23, %7[%24, %25] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<4x128xf32>
gpu.barrier
%27 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%28 = vector.transfer_read %20[%27, %c0], %cst {in_bounds = [true, true]} : tensor<128x4xf32>, vector<16x4xf32>
%29 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%30 = vector.transfer_read %26[%c0, %29], %cst {in_bounds = [true, true]} : tensor<4x128xf32>, vector<4x16xf32>
%31 = vector.transfer_read %arg4[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
%32 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %30, %31 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
%33 = vector.transfer_write %32, %arg4[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32>
scf.yield %33 : tensor<16x16xf32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %14 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}[transform-dialect] Top-level payload:
func.func @main() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%14 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
%15:2 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %extracted_slice, %arg5 = %14) -> (tensor<16x16xf32>, vector<16x16xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%18 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%19 = vector.transfer_read %3[%17, %18], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%22 = vector.transfer_write %19, %6[%20, %21] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<128x4xf32>
gpu.barrier
%23 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%13#0)[%arg3]
%24 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%25 = vector.transfer_read %4[%23, %24], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%26 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%27 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%28 = vector.transfer_write %25, %7[%26, %27] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<4x128xf32>
gpu.barrier
%29 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%30 = vector.transfer_read %22[%29, %c0], %cst {in_bounds = [true, true]} : tensor<128x4xf32>, vector<16x4xf32>
%31 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%32 = vector.transfer_read %28[%c0, %31], %cst {in_bounds = [true, true]} : tensor<4x128xf32>, vector<4x16xf32>
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %30, %32, %arg5 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %arg4, %33 : tensor<16x16xf32>, vector<16x16xf32>
}
%16 = vector.transfer_write %15#1, %15#0[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}[transform-dialect] Top-level payload:
func.func @main() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%14 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
%15 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %14) -> (vector<16x16xf32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%18 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%19 = vector.transfer_read %3[%17, %18], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%22 = vector.transfer_write %19, %6[%20, %21] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<128x4xf32>
gpu.barrier
%23 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%13#0)[%arg3]
%24 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%25 = vector.transfer_read %4[%23, %24], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%26 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%27 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%28 = vector.transfer_write %25, %7[%26, %27] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<4x128xf32>
gpu.barrier
%29 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%30 = vector.transfer_read %22[%29, %c0], %cst {in_bounds = [true, true]} : tensor<128x4xf32>, vector<16x4xf32>
%31 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%32 = vector.transfer_read %28[%c0, %31], %cst {in_bounds = [true, true]} : tensor<4x128xf32>, vector<4x16xf32>
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %30, %32, %arg4 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %33 : vector<16x16xf32>
}
%16 = vector.transfer_write %15, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %16 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}[transform-dialect] Top-level payload:
func.func @main() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%14 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
%15 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%16 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%20 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%22 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%23 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %14) -> (vector<16x16xf32>) {
%25 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%26 = vector.transfer_read %3[%15, %25], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%27 = vector.transfer_write %26, %6[%16, %17] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<128x4xf32>
gpu.barrier
%28 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%13#0)[%arg3]
%29 = vector.transfer_read %4[%28, %18], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%30 = vector.transfer_write %29, %7[%19, %20] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<4x128xf32>
gpu.barrier
%31 = vector.transfer_read %27[%21, %c0], %cst {in_bounds = [true, true]} : tensor<128x4xf32>, vector<16x4xf32>
%32 = vector.transfer_read %30[%c0, %22], %cst {in_bounds = [true, true]} : tensor<4x128xf32>, vector<4x16xf32>
%33 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %31, %32, %arg4 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %33 : vector<16x16xf32>
}
%24 = vector.transfer_write %23, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %24 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}[transform-dialect] Top-level payload:
func.func @main() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x128xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128xf32>> -> tensor<128x128xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<128x128xf32>> -> tensor<128x128xf32>
%6 = tensor.empty() : tensor<128x4xf32>
%7 = tensor.empty() : tensor<4x128xf32>
%8 = scf.forall (%arg0, %arg1) in (8, 8) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%extracted_slice = tensor.extract_slice %arg2[%9, %10] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
%11 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%12:2 = affine.delinearize_index %11 into (%c64, %c1) : index, index
%13:2 = affine.delinearize_index %11 into (%c2, %c32) : index, index
%14 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32>
%15 = affine.apply affine_map<(d0) -> (d0 * 2)>(%12#0)
%16 = affine.apply affine_map<(d0) -> (d0 * 4)>(%12#1)
%17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%13#1)
%18 = affine.apply affine_map<(d0) -> (d0 * 2)>(%13#0)
%19 = scf.for %arg3 = %c0 to %c128 step %c4 iter_args(%arg4 = %14) -> (vector<16x16xf32>) {
%21 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%12#1)[%arg3]
%22 = vector.transfer_read %3[%15, %21], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%23 = vector.transfer_write %22, %6[%15, %16] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<128x4xf32>
gpu.barrier
%24 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%13#0)[%arg3]
%25 = vector.transfer_read %4[%24, %17], %cst {in_bounds = [true, true]} : tensor<128x128xf32>, vector<2x4xf32>
%26 = vector.transfer_write %25, %7[%18, %17] {in_bounds = [true, true]} : vector<2x4xf32>, tensor<4x128xf32>
gpu.barrier
%27 = vector.transfer_read %23[%9, %c0], %cst {in_bounds = [true, true]} : tensor<128x4xf32>, vector<16x4xf32>
%28 = vector.transfer_read %26[%c0, %10], %cst {in_bounds = [true, true]} : tensor<4x128xf32>, vector<4x16xf32>
%29 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %28, %arg4 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %29 : vector<16x16xf32>
}
%20 = vector.transfer_write %19, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg2[%9, %10] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<readwrite:tensor<128x128xf32>>
return
}[transform-dialect] Top-level payload:
func.func @main() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (8, 8) {
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg0)
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%subview = memref.subview %2[%3, %4] [16, 16] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<16x16xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%arg0, %arg1)
%6:2 = affine.delinearize_index %5 into (%c64, %c1) : index, index
%7:2 = affine.delinearize_index %5 into (%c2, %c32) : index, index
%8 = vector.transfer_read %subview[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16x16xf32>
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%6#0)
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%6#1)
%11 = affine.apply affine_map<(d0) -> (d0 * 4)>(%7#1)
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%7#0)
%13 = scf.for %arg2 = %c0 to %c128 step %c4 iter_args(%arg3 = %8) -> (vector<16x16xf32>) {
%14 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%6#1)[%arg2]
%15 = vector.transfer_read %0[%9, %14], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc = memref.alloc() {alignment = 64 : i64} : memref<128x4xf32, #gpu.address_space<workgroup>>
vector.transfer_write %15, %alloc[%9, %10] {in_bounds = [true, true]} : vector<2x4xf32>, memref<128x4xf32, #gpu.address_space<workgroup>>
gpu.barrier
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%7#0)[%arg2]
%17 = vector.transfer_read %1[%16, %11], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<4x128xf32, #gpu.address_space<workgroup>>
vector.transfer_write %17, %alloc_0[%12, %11] {in_bounds = [true, true]} : vector<2x4xf32>, memref<4x128xf32, #gpu.address_space<workgroup>>
gpu.barrier
%18 = vector.transfer_read %alloc[%3, %c0], %cst {in_bounds = [true, true]} : memref<128x4xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
%19 = vector.transfer_read %alloc_0[%c0, %4], %cst {in_bounds = [true, true]} : memref<4x128xf32, #gpu.address_space<workgroup>>, vector<4x16xf32>
%20 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %19, %arg3 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %20 : vector<16x16xf32>
}
vector.transfer_write %13, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
return
}[transform-dialect] Top-level payload:
func.func @main() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [8, 8, 1] subgroup_size = 32>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c0_0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%thread_id_y)
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%thread_id_x)
%subview = memref.subview %2[%3, %4] [16, 16] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<16x16xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%thread_id_y, %thread_id_x)
%6:2 = affine.delinearize_index %5 into (%c64, %c1) : index, index
%7:2 = affine.delinearize_index %5 into (%c2, %c32) : index, index
%8 = vector.transfer_read %subview[%c0_0, %c0_0], %cst {in_bounds = [true, true]} : memref<16x16xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16x16xf32>
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%6#0)
%10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%6#1)
%11 = affine.apply affine_map<(d0) -> (d0 * 4)>(%7#1)
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%7#0)
%13 = scf.for %arg0 = %c0_0 to %c128 step %c4 iter_args(%arg1 = %8) -> (vector<16x16xf32>) {
%14 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%6#1)[%arg0]
%15 = vector.transfer_read %0[%9, %14], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc = memref.alloc() {alignment = 64 : i64} : memref<128x4xf32, #gpu.address_space<workgroup>>
vector.transfer_write %15, %alloc[%9, %10] {in_bounds = [true, true]} : vector<2x4xf32>, memref<128x4xf32, #gpu.address_space<workgroup>>
gpu.barrier
%16 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%7#0)[%arg0]
%17 = vector.transfer_read %1[%16, %11], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<4x128xf32, #gpu.address_space<workgroup>>
vector.transfer_write %17, %alloc_1[%12, %11] {in_bounds = [true, true]} : vector<2x4xf32>, memref<4x128xf32, #gpu.address_space<workgroup>>
gpu.barrier
%18 = vector.transfer_read %alloc[%3, %c0_0], %cst {in_bounds = [true, true]} : memref<128x4xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
%19 = vector.transfer_read %alloc_1[%c0_0, %4], %cst {in_bounds = [true, true]} : memref<4x128xf32, #gpu.address_space<workgroup>>, vector<4x16xf32>
%20 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %19, %arg1 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %20 : vector<16x16xf32>
}
vector.transfer_write %13, %subview[%c0_0, %c0_0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
return
}[transform-dialect] Top-level payload:
func.func @main() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [8, 8, 1] subgroup_size = 32>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%thread_id_y)
%4 = affine.apply affine_map<(d0) -> (d0 * 16)>(%thread_id_x)
%5 = affine.apply affine_map<(d0, d1) -> (d0 * 8 + d1)>(%thread_id_y, %thread_id_x)
%6:2 = affine.delinearize_index %5 into (%c64, %c1) : index, index
%7:2 = affine.delinearize_index %5 into (%c2, %c32) : index, index
%8 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_x]
%10 = vector.transfer_read %2[%8, %9], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<16x16xf32>
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%6#0)
%12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%6#1)
%13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%7#1)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%7#0)
%15 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %10) -> (vector<16x16xf32>) {
%18 = affine.apply affine_map<(d0)[s0] -> (d0 * 4 + s0)>(%6#1)[%arg0]
%19 = vector.transfer_read %0[%11, %18], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc = memref.alloc() {alignment = 64 : i64} : memref<128x4xf32, #gpu.address_space<workgroup>>
vector.transfer_write %19, %alloc[%11, %12] {in_bounds = [true, true]} : vector<2x4xf32>, memref<128x4xf32, #gpu.address_space<workgroup>>
gpu.barrier
%20 = affine.apply affine_map<(d0)[s0] -> (d0 * 2 + s0)>(%7#0)[%arg0]
%21 = vector.transfer_read %1[%20, %13], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<4x128xf32, #gpu.address_space<workgroup>>
vector.transfer_write %21, %alloc_0[%14, %13] {in_bounds = [true, true]} : vector<2x4xf32>, memref<4x128xf32, #gpu.address_space<workgroup>>
gpu.barrier
%22 = vector.transfer_read %alloc[%3, %c0], %cst {in_bounds = [true, true]} : memref<128x4xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
%23 = vector.transfer_read %alloc_0[%c0, %4], %cst {in_bounds = [true, true]} : memref<4x128xf32, #gpu.address_space<workgroup>>, vector<4x16xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %arg1 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %24 : vector<16x16xf32>
}
%16 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%17 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_x]
vector.transfer_write %15, %2[%16, %17] {in_bounds = [true, true]} : vector<16x16xf32>, memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
return
}[transform-dialect] Top-level payload:
func.func @main() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [8, 8, 1] subgroup_size = 32>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_x]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * 8 + s1)>()[%thread_id_y, %thread_id_x]
%6:2 = affine.delinearize_index %5 into (%c64, %c1) : index, index
%7:2 = affine.delinearize_index %5 into (%c2, %c32) : index, index
%8 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%9 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_x]
%10 = vector.transfer_read %2[%8, %9], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<16x16xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%6#0]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%6#1]
%13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7#1]
%14 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%7#0]
%15 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %10) -> (vector<16x16xf32>) {
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4)>()[%arg0, %6#1]
%19 = vector.transfer_read %0[%11, %18], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc = memref.alloc() {alignment = 64 : i64} : memref<128x4xf32, #gpu.address_space<workgroup>>
vector.transfer_write %19, %alloc[%11, %12] {in_bounds = [true, true]} : vector<2x4xf32>, memref<128x4xf32, #gpu.address_space<workgroup>>
gpu.barrier
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 2)>()[%arg0, %7#0]
%21 = vector.transfer_read %1[%20, %13], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<4x128xf32, #gpu.address_space<workgroup>>
vector.transfer_write %21, %alloc_0[%14, %13] {in_bounds = [true, true]} : vector<2x4xf32>, memref<4x128xf32, #gpu.address_space<workgroup>>
gpu.barrier
%22 = vector.transfer_read %alloc[%3, %c0], %cst {in_bounds = [true, true]} : memref<128x4xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
%23 = vector.transfer_read %alloc_0[%c0, %4], %cst {in_bounds = [true, true]} : memref<4x128xf32, #gpu.address_space<workgroup>>, vector<4x16xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %arg1 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %24 : vector<16x16xf32>
}
%16 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%17 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_x]
vector.transfer_write %15, %2[%16, %17] {in_bounds = [true, true]} : vector<16x16xf32>, memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
return
}
[transform-dialect] Top-level payload:
func.func @main() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [8, 8, 1] subgroup_size = 32>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_x]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * 8 + s1)>()[%thread_id_y, %thread_id_x]
%6:2 = affine.delinearize_index %5 into (%c64, %c1) : index, index
%7:2 = affine.delinearize_index %5 into (%c2, %c32) : index, index
%8 = vector.transfer_read %2[%3, %4], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<16x16xf32>
%9 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%6#0]
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%6#1]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7#1]
%12 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%7#0]
%13 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %8) -> (vector<16x16xf32>) {
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4)>()[%arg0, %6#1]
%15 = vector.transfer_read %0[%9, %14], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc = memref.alloc() {alignment = 64 : i64} : memref<128x4xf32, #gpu.address_space<workgroup>>
vector.transfer_write %15, %alloc[%9, %10] {in_bounds = [true, true]} : vector<2x4xf32>, memref<128x4xf32, #gpu.address_space<workgroup>>
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 2)>()[%arg0, %7#0]
%17 = vector.transfer_read %1[%16, %11], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
%alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<4x128xf32, #gpu.address_space<workgroup>>
vector.transfer_write %17, %alloc_0[%12, %11] {in_bounds = [true, true]} : vector<2x4xf32>, memref<4x128xf32, #gpu.address_space<workgroup>>
gpu.barrier
%18 = vector.transfer_read %alloc[%3, %c0], %cst {in_bounds = [true, true]} : memref<128x4xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
%19 = vector.transfer_read %alloc_0[%c0, %4], %cst {in_bounds = [true, true]} : memref<4x128xf32, #gpu.address_space<workgroup>>, vector<4x16xf32>
%20 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %19, %arg1 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %20 : vector<16x16xf32>
}
vector.transfer_write %13, %2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
return
}
[transform-dialect] Top-level payload:
func.func @main() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [8, 8, 1] subgroup_size = 32>} {
%alloc = memref.alloc() {alignment = 64 : i64} : memref<4x128xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<128x4xf32, #gpu.address_space<workgroup>>
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_x]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * 8 + s1)>()[%thread_id_y, %thread_id_x]
%6:2 = affine.delinearize_index %5 into (%c64, %c1) : index, index
%7:2 = affine.delinearize_index %5 into (%c2, %c32) : index, index
%8 = vector.transfer_read %2[%3, %4], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<16x16xf32>
%9 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%6#0]
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%6#1]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7#1]
%12 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%7#0]
%13 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %8) -> (vector<16x16xf32>) {
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4)>()[%arg0, %6#1]
%15 = vector.transfer_read %0[%9, %14], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
vector.transfer_write %15, %alloc_0[%9, %10] {in_bounds = [true, true]} : vector<2x4xf32>, memref<128x4xf32, #gpu.address_space<workgroup>>
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 2)>()[%arg0, %7#0]
%17 = vector.transfer_read %1[%16, %11], %cst {in_bounds = [true, true]} : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>, vector<2x4xf32>
vector.transfer_write %17, %alloc[%12, %11] {in_bounds = [true, true]} : vector<2x4xf32>, memref<4x128xf32, #gpu.address_space<workgroup>>
gpu.barrier
%18 = vector.transfer_read %alloc_0[%3, %c0], %cst {in_bounds = [true, true]} : memref<128x4xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
%19 = vector.transfer_read %alloc[%c0, %4], %cst {in_bounds = [true, true]} : memref<4x128xf32, #gpu.address_space<workgroup>>, vector<4x16xf32>
%20 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %19, %arg1 : vector<16x4xf32>, vector<4x16xf32> into vector<16x16xf32>
scf.yield %20 : vector<16x16xf32>
}
vector.transfer_write %13, %2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<128x128xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.dealloc %alloc_0 : memref<128x4xf32, #gpu.address_space<workgroup>>
memref.dealloc %alloc : memref<4x128xf32, #gpu.address_space<workgroup>>
return
}