Created
December 5, 2024 14:05
-
-
Save Yu-Zhewen/5f569b56c7b1f1a8715a7c4c3bf9e609 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
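Before optimization (each batch of eight async half_dma_cpy_nd ops is immediately followed by its own eight dma_wait ops; the four tokens %187-%190 are waited on at the very end): | |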
---------------------------------------------------------------------------------------------------------------------------- | |
---------------------------------------------------------------------------------------------------------------------------- | |
%bd_id = amdaie.bd_id(%tile_71, %c0) | |
amdaie.npu.half_dma_cpy_nd %138(%11 [] [] [] channel = %channel_368) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%187 = amdaie.npu.half_dma_cpy_nd async %138(%150 [0, 0, 96, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id channel = %channel_369) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_370 = amdaie.bd_id(%tile_68, %c0) | |
amdaie.npu.half_dma_cpy_nd %135(%10 [] [] [] channel = %channel_366) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%188 = amdaie.npu.half_dma_cpy_nd async %135(%149 [0, 0, 64, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id_370 channel = %channel_367) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_371 = amdaie.bd_id(%tile_65, %c0) | |
amdaie.npu.half_dma_cpy_nd %132(%9 [] [] [] channel = %channel_364) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%189 = amdaie.npu.half_dma_cpy_nd async %132(%148 [0, 0, 32, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id_371 channel = %channel_365) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_372 = amdaie.bd_id(%tile_63, %c0) | |
amdaie.npu.half_dma_cpy_nd %129(%8 [] [] [] channel = %channel_362) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%190 = amdaie.npu.half_dma_cpy_nd async %129(%147 [0, 0, 0, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id_372 channel = %channel_363) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_373 = amdaie.bd_id(%tile_71, %c1) | |
%191 = amdaie.npu.half_dma_cpy_nd async %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_373 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_374 = amdaie.bd_id(%tile_68, %c1) | |
%192 = amdaie.npu.half_dma_cpy_nd async %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_374 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_375 = amdaie.bd_id(%tile_65, %c1) | |
%193 = amdaie.npu.half_dma_cpy_nd async %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_375 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_376 = amdaie.bd_id(%tile_63, %c1) | |
%194 = amdaie.npu.half_dma_cpy_nd async %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_376 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_377 = amdaie.bd_id(%tile_71, %c8) | |
%195 = amdaie.npu.half_dma_cpy_nd async %26(%142 [0, 0, 96, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_377 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_378 = amdaie.bd_id(%tile_68, %c8) | |
%196 = amdaie.npu.half_dma_cpy_nd async %23(%141 [0, 0, 64, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_378 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_379 = amdaie.bd_id(%tile_65, %c8) | |
%197 = amdaie.npu.half_dma_cpy_nd async %20(%140 [0, 0, 32, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_379 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_380 = amdaie.bd_id(%tile_63, %c8) | |
%198 = amdaie.npu.half_dma_cpy_nd async %17(%139 [0, 0, 0, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_380 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
amdaie.npu.dma_wait(%191 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%192 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%193 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%194 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%195 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%196 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%197 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%198 : !amdaie.async_token) | |
%bd_id_381 = amdaie.bd_id(%tile_71, %c2) | |
%199 = amdaie.npu.half_dma_cpy_nd async %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_381 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_382 = amdaie.bd_id(%tile_68, %c2) | |
%200 = amdaie.npu.half_dma_cpy_nd async %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_382 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_383 = amdaie.bd_id(%tile_65, %c2) | |
%201 = amdaie.npu.half_dma_cpy_nd async %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_383 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_384 = amdaie.bd_id(%tile_63, %c2) | |
%202 = amdaie.npu.half_dma_cpy_nd async %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_384 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_385 = amdaie.bd_id(%tile_71, %c9) | |
%203 = amdaie.npu.half_dma_cpy_nd async %26(%142 [0, 0, 224, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_385 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_386 = amdaie.bd_id(%tile_68, %c9) | |
%204 = amdaie.npu.half_dma_cpy_nd async %23(%141 [0, 0, 192, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_386 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_387 = amdaie.bd_id(%tile_65, %c9) | |
%205 = amdaie.npu.half_dma_cpy_nd async %20(%140 [0, 0, 160, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_387 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_388 = amdaie.bd_id(%tile_63, %c9) | |
%206 = amdaie.npu.half_dma_cpy_nd async %17(%139 [0, 0, 128, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_388 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
amdaie.npu.dma_wait(%199 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%200 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%201 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%202 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%203 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%204 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%205 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%206 : !amdaie.async_token) | |
%bd_id_389 = amdaie.bd_id(%tile_71, %c3) | |
%207 = amdaie.npu.half_dma_cpy_nd async %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_389 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_390 = amdaie.bd_id(%tile_68, %c3) | |
%208 = amdaie.npu.half_dma_cpy_nd async %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_390 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_391 = amdaie.bd_id(%tile_65, %c3) | |
%209 = amdaie.npu.half_dma_cpy_nd async %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_391 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_392 = amdaie.bd_id(%tile_63, %c3) | |
%210 = amdaie.npu.half_dma_cpy_nd async %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_392 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_393 = amdaie.bd_id(%tile_71, %c10) | |
%211 = amdaie.npu.half_dma_cpy_nd async %26(%142 [0, 0, 352, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_393 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_394 = amdaie.bd_id(%tile_68, %c10) | |
%212 = amdaie.npu.half_dma_cpy_nd async %23(%141 [0, 0, 320, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_394 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_395 = amdaie.bd_id(%tile_65, %c10) | |
%213 = amdaie.npu.half_dma_cpy_nd async %20(%140 [0, 0, 288, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_395 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_396 = amdaie.bd_id(%tile_63, %c10) | |
%214 = amdaie.npu.half_dma_cpy_nd async %17(%139 [0, 0, 256, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_396 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
amdaie.npu.dma_wait(%207 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%208 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%209 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%210 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%211 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%212 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%213 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%214 : !amdaie.async_token) | |
%bd_id_397 = amdaie.bd_id(%tile_71, %c4) | |
%215 = amdaie.npu.half_dma_cpy_nd async %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_397 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_398 = amdaie.bd_id(%tile_68, %c4) | |
%216 = amdaie.npu.half_dma_cpy_nd async %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_398 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_399 = amdaie.bd_id(%tile_65, %c4) | |
%217 = amdaie.npu.half_dma_cpy_nd async %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_399 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_400 = amdaie.bd_id(%tile_63, %c4) | |
%218 = amdaie.npu.half_dma_cpy_nd async %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_400 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_401 = amdaie.bd_id(%tile_71, %c11) | |
%219 = amdaie.npu.half_dma_cpy_nd async %26(%142 [0, 0, 480, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_401 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_402 = amdaie.bd_id(%tile_68, %c11) | |
%220 = amdaie.npu.half_dma_cpy_nd async %23(%141 [0, 0, 448, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_402 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_403 = amdaie.bd_id(%tile_65, %c11) | |
%221 = amdaie.npu.half_dma_cpy_nd async %20(%140 [0, 0, 416, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_403 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_404 = amdaie.bd_id(%tile_63, %c11) | |
%222 = amdaie.npu.half_dma_cpy_nd async %17(%139 [0, 0, 384, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_404 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
amdaie.npu.dma_wait(%215 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%216 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%217 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%218 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%219 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%220 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%221 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%222 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%187 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%188 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%189 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%190 : !amdaie.async_token) | |
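After optimization (the first three batches are issued without async tokens or dma_wait ops; only the final batch of eight, %191-%198, stays async and is waited on, followed by the waits on %187-%190): | |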
---------------------------------------------------------------------------------------------------------------------------- | |
---------------------------------------------------------------------------------------------------------------------------- | |
%bd_id = amdaie.bd_id(%tile_71, %c0) | |
amdaie.npu.half_dma_cpy_nd %138(%11 [] [] [] channel = %channel_368) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%187 = amdaie.npu.half_dma_cpy_nd async %138(%150 [0, 0, 96, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id channel = %channel_369) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_370 = amdaie.bd_id(%tile_68, %c0) | |
amdaie.npu.half_dma_cpy_nd %135(%10 [] [] [] channel = %channel_366) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%188 = amdaie.npu.half_dma_cpy_nd async %135(%149 [0, 0, 64, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id_370 channel = %channel_367) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_371 = amdaie.bd_id(%tile_65, %c0) | |
amdaie.npu.half_dma_cpy_nd %132(%9 [] [] [] channel = %channel_364) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%189 = amdaie.npu.half_dma_cpy_nd async %132(%148 [0, 0, 32, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id_371 channel = %channel_365) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_372 = amdaie.bd_id(%tile_63, %c0) | |
amdaie.npu.half_dma_cpy_nd %129(%8 [] [] [] channel = %channel_362) : !amdaie.logicalobjectfifo<memref<4096xf32, 1 : i32>, 2> | |
%190 = amdaie.npu.half_dma_cpy_nd async %129(%147 [0, 0, 0, 0] [4, 4, 32, 128] [65536, 128, 512, 1] bd_id = %bd_id_372 channel = %channel_363) : !amdaie.logicalobjectfifo<memref<262144xf32>> | |
%bd_id_373 = amdaie.bd_id(%tile_71, %c1) | |
amdaie.npu.half_dma_cpy_nd %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_373 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_374 = amdaie.bd_id(%tile_68, %c1) | |
amdaie.npu.half_dma_cpy_nd %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_374 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_375 = amdaie.bd_id(%tile_65, %c1) | |
amdaie.npu.half_dma_cpy_nd %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_375 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_376 = amdaie.bd_id(%tile_63, %c1) | |
amdaie.npu.half_dma_cpy_nd %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_376 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_377 = amdaie.bd_id(%tile_71, %c8) | |
amdaie.npu.half_dma_cpy_nd %26(%142 [0, 0, 96, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_377 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_378 = amdaie.bd_id(%tile_68, %c8) | |
amdaie.npu.half_dma_cpy_nd %23(%141 [0, 0, 64, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_378 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_379 = amdaie.bd_id(%tile_65, %c8) | |
amdaie.npu.half_dma_cpy_nd %20(%140 [0, 0, 32, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_379 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_380 = amdaie.bd_id(%tile_63, %c8) | |
amdaie.npu.half_dma_cpy_nd %17(%139 [0, 0, 0, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_380 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_381 = amdaie.bd_id(%tile_71, %c2) | |
amdaie.npu.half_dma_cpy_nd %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_381 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_382 = amdaie.bd_id(%tile_68, %c2) | |
amdaie.npu.half_dma_cpy_nd %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_382 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_383 = amdaie.bd_id(%tile_65, %c2) | |
amdaie.npu.half_dma_cpy_nd %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_383 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_384 = amdaie.bd_id(%tile_63, %c2) | |
amdaie.npu.half_dma_cpy_nd %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_384 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_385 = amdaie.bd_id(%tile_71, %c9) | |
amdaie.npu.half_dma_cpy_nd %26(%142 [0, 0, 224, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_385 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_386 = amdaie.bd_id(%tile_68, %c9) | |
amdaie.npu.half_dma_cpy_nd %23(%141 [0, 0, 192, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_386 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_387 = amdaie.bd_id(%tile_65, %c9) | |
amdaie.npu.half_dma_cpy_nd %20(%140 [0, 0, 160, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_387 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_388 = amdaie.bd_id(%tile_63, %c9) | |
amdaie.npu.half_dma_cpy_nd %17(%139 [0, 0, 128, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_388 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_389 = amdaie.bd_id(%tile_71, %c3) | |
amdaie.npu.half_dma_cpy_nd %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_389 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_390 = amdaie.bd_id(%tile_68, %c3) | |
amdaie.npu.half_dma_cpy_nd %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_390 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_391 = amdaie.bd_id(%tile_65, %c3) | |
amdaie.npu.half_dma_cpy_nd %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_391 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_392 = amdaie.bd_id(%tile_63, %c3) | |
amdaie.npu.half_dma_cpy_nd %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_392 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_393 = amdaie.bd_id(%tile_71, %c10) | |
amdaie.npu.half_dma_cpy_nd %26(%142 [0, 0, 352, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_393 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_394 = amdaie.bd_id(%tile_68, %c10) | |
amdaie.npu.half_dma_cpy_nd %23(%141 [0, 0, 320, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_394 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_395 = amdaie.bd_id(%tile_65, %c10) | |
amdaie.npu.half_dma_cpy_nd %20(%140 [0, 0, 288, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_395 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_396 = amdaie.bd_id(%tile_63, %c10) | |
amdaie.npu.half_dma_cpy_nd %17(%139 [0, 0, 256, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_396 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_397 = amdaie.bd_id(%tile_71, %c4) | |
%191 = amdaie.npu.half_dma_cpy_nd async %38(%146 [0, 0, 0, 96] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_397 channel = %channel_80) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %38(%3 [] [] [] channel = %channel_81) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_398 = amdaie.bd_id(%tile_68, %c4) | |
%192 = amdaie.npu.half_dma_cpy_nd async %35(%145 [0, 0, 0, 64] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_398 channel = %channel_78) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %35(%2 [] [] [] channel = %channel_79) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_399 = amdaie.bd_id(%tile_65, %c4) | |
%193 = amdaie.npu.half_dma_cpy_nd async %32(%144 [0, 0, 0, 32] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_399 channel = %channel_76) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %32(%1 [] [] [] channel = %channel_77) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_400 = amdaie.bd_id(%tile_63, %c4) | |
%194 = amdaie.npu.half_dma_cpy_nd async %29(%143 [0, 0, 0, 0] [4, 128, 32, 32] [128, 16384, 512, 1] bd_id = %bd_id_400 channel = %channel_74) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %29(%0 [] [] [] channel = %channel_75) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_401 = amdaie.bd_id(%tile_71, %c11) | |
%195 = amdaie.npu.half_dma_cpy_nd async %26(%142 [0, 0, 480, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_401 channel = %channel_72) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %26(%7 [] [] [] channel = %channel_73) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_402 = amdaie.bd_id(%tile_68, %c11) | |
%196 = amdaie.npu.half_dma_cpy_nd async %23(%141 [0, 0, 448, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_402 channel = %channel_69) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %23(%6 [] [] [] channel = %channel_70) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_403 = amdaie.bd_id(%tile_65, %c11) | |
%197 = amdaie.npu.half_dma_cpy_nd async %20(%140 [0, 0, 416, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_403 channel = %channel_66) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %20(%5 [] [] [] channel = %channel_67) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
%bd_id_404 = amdaie.bd_id(%tile_63, %c11) | |
%198 = amdaie.npu.half_dma_cpy_nd async %17(%139 [0, 0, 384, 0] [4, 128, 32, 32] [0, 32, 4096, 1] bd_id = %bd_id_404 channel = %channel) : !amdaie.logicalobjectfifo<memref<2097152xbf16>> | |
amdaie.npu.half_dma_cpy_nd %17(%4 [] [] [] channel = %channel_64) : !amdaie.logicalobjectfifo<memref<1024xbf16, 1 : i32>, 2> | |
amdaie.npu.dma_wait(%191 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%192 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%193 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%194 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%195 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%196 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%197 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%198 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%187 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%188 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%189 : !amdaie.async_token) | |
amdaie.npu.dma_wait(%190 : !amdaie.async_token) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment