Created
May 8, 2016 06:33
-
-
Save shakram02/d59d028692195f5a2961c3367b43acd2 to your computer and use it in GitHub Desktop.
AMP - Tiled matrix multiplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Computes vC = vA * vB on the default accelerator using C++ AMP tiling.
//
//   vA : first matrix,  M x W, row-major (must hold at least M * W elements)
//   vB : second matrix, W x N, row-major (must hold at least W * N elements)
//   vC : result matrix, M x N, row-major (must hold at least M * N elements);
//        its previous contents are ignored and fully overwritten.
//
// M, N and W do not have to be multiples of the tile size: the compute domain
// is padded up to whole tiles and the padding threads are masked out.
void MatrixMultiplyTiled(vector<int>& vC, const vector<int>& vA,
    const vector<int>& vB, int M, int N, int W)
{
    // The const element type on aMat and bMat means the underlying data is
    // only copied to the accelerator and never copied back.
    array_view<const int, 2> aMat(M, W, vA);
    array_view<const int, 2> bMat(W, N, vB);
    array_view<int, 2> resultMat(M, N, vC);

    // Every element of the result is written by the kernel, so the current
    // contents of vC do not need to be copied to the accelerator.
    resultMat.discard_data();

    const int tile_size = 2;

    // A tiled compute domain must be an exact multiple of the tile dimensions.
    // pad() rounds the extent up to whole tiles; the extra threads take part
    // in the barriers but never touch memory outside the matrices.
    parallel_for_each(resultMat.extent.tile<tile_size, tile_size>().pad(),
        [=](tiled_index<tile_size, tile_size> tiled_idx) restrict(amp) {
            const int row = tiled_idx.local[0];        // Row of this thread inside its tile
            const int col = tiled_idx.local[1];        // Column of this thread inside its tile
            const int globalRow = tiled_idx.global[0]; // Row in the (padded) result matrix
            const int globalCol = tiled_idx.global[1]; // Column in the (padded) result matrix
            int sum = 0;                               // Dot product accumulated by this thread

            tile_static int localA[tile_size][tile_size];
            tile_static int localB[tile_size][tile_size];

            // Walk the shared dimension W one tile-wide strip at a time.
            for (int i = 0; i < W; i += tile_size)
            {
                const int aCol = col + i;
                const int bRow = row + i;

                // Out-of-range elements are staged as 0 so they contribute
                // nothing to the sum. The bounds checks guard only the loads,
                // so both barriers below are reached by every thread of the tile.
                localA[row][col] = (globalRow < M && aCol < W) ? aMat(globalRow, aCol) : 0;
                localB[row][col] = (bRow < W && globalCol < N) ? bMat(bRow, globalCol) : 0;

                tiled_idx.barrier.wait(); // Wait for all tile threads to finish staging

                for (int k = 0; k < tile_size; k++)
                    sum += localA[row][k] * localB[k][col];

                tiled_idx.barrier.wait(); // Wait before the next strip overwrites the tiles
            }

            // Threads that exist only because of padding have no output element.
            if (globalRow < M && globalCol < N)
                resultMat(globalRow, globalCol) = sum;
        });

    // Copy the results back into vC before returning to the caller.
    resultMat.synchronize();

    /*
    If we wanted to choose a certain accelerator we could have
    passed its view to the parallel foreach function acc.view
    */
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment