Skip to content

Instantly share code, notes, and snippets.

@shakram02
Created May 8, 2016 06:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shakram02/d59d028692195f5a2961c3367b43acd2 to your computer and use it in GitHub Desktop.
AMP - Tiled matrix multiplication
// Tiled matrix multiplication using C++ AMP (Microsoft's GPU-accelerated
// parallelism library, <amp.h>).
// Computes vC = vA * vB, where vA is an M x W matrix and vB is a W x N
// matrix, both stored row-major in flat vectors; the M x N product is
// written into vC.
// NOTE(review): AMP's tiled parallel_for_each requires every extent
// dimension to be evenly divisible by the tile edge (tile_size = 2 here).
// This code assumes M, N, and W all satisfy that; otherwise
// parallel_for_each fails at runtime -- confirm with callers.
void MatrixMultiplyTiled(vector<int>& vC, const vector<int>& vA,
const vector<int>& vB, int M, int N, int W)
{
// first mat: M × W
// second mat: W × N
// Declaring the element type as `const int` tells AMP these views are
// input-only: the underlying data is copied to the accelerator but never
// copied back -- a nice optimization.
array_view<const int, 2> aMat(M, W, vA);
array_view<const int, 2> bMat(W, N, vB);
array_view<int, 2> resultMat(M, N, vC);
// discard_data(): vC's current contents need not be copied to the
// accelerator, since the kernel fully overwrites every element.
resultMat.discard_data();
// Tile edge length; must be a compile-time constant to instantiate tile<>.
const int tile_size = 2;
// Launch one accelerator thread per result element, grouped into
// tile_size x tile_size tiles; each tile shares tile_static memory.
parallel_for_each(resultMat.extent.tile<tile_size, tile_size >(),
[=](tiled_index<tile_size, tile_size> tiled_idx) restrict(amp) {
int row = tiled_idx.local[0]; // This thread's row within its tile
int col = tiled_idx.local[1]; // This thread's column within its tile
int sum = 0; // Running dot product for this thread's output element
// Scratch buffers shared by all threads of one tile (tile_static is
// the tile-shared memory space on the accelerator).
tile_static int localA[tile_size][tile_size], localB[tile_size][tile_size];
#pragma region If memory was single tile this code is enough
//localA[row][col] = aMat(tiled_idx.global[0], col);
//localB[row][col] = bMat(row, tiled_idx.global[0]);
//for (int k = 0; k < TILE_SIZE; k++)
// sum += localA[row][k] * localB[k][col];
#pragma endregion
// Walk the shared dimension W one tile at a time: stage one tile of A
// and one tile of B into shared memory, then accumulate their partial
// product into sum.
for (int i = 0; i < W; i += tile_size)
{
// Each thread of the tile copies exactly one element of each input tile.
localA[row][col] = aMat(tiled_idx.global[0], col + i);
localB[row][col] = bMat(row + i, tiled_idx.global[1]);
tiled_idx.barrier.wait(); // All copies must finish before any thread reads
for (int k = 0; k < tile_size; k++)
sum += localA[row][k] * localB[k][col];
tiled_idx.barrier.wait(); // All reads must finish before the next copy pass
}
// tiled_index converts to its global index, addressing this thread's
// element of the result matrix.
resultMat[tiled_idx] = sum;
});
// Copies the computed results back from the accelerator into vC.
resultMat.synchronize(); // It's good practice to synchronize
/*
If we wanted to target a specific accelerator, we could have passed its
accelerator_view as the first argument to parallel_for_each.
*/
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment