Skip to content

Instantly share code, notes, and snippets.

@shakram02
Created May 8, 2016 06:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shakram02/d59d028692195f5a2961c3367b43acd2 to your computer and use it in GitHub Desktop.
AMP - Tiled matrix multiplication
// Tiled matrix multiplication using C++ AMP (Microsoft's GPU-accelerated
// parallelism library, <amp.h>).
// Computes vC = vA * vB, where vA is an M x W matrix and vB is a W x N
// matrix, both stored row-major in flat vectors; the M x N product is
// written into vC.
// NOTE(review): AMP's tiled parallel_for_each requires every extent
// dimension to be evenly divisible by the tile edge (tile_size = 2 here).
// This code assumes M, N, and W all satisfy that; otherwise
// parallel_for_each fails at runtime -- confirm with callers.
void MatrixMultiplyTiled(vector<int>& vC, const vector<int>& vA,
const vector<int>& vB, int M, int N, int W)
{
// first mat: M × W
// second mat: W × N
// Declaring the element type as `const int` tells AMP these views are
// input-only: the underlying data is copied to the accelerator but never
// copied back -- a nice optimization.
array_view<const int, 2> aMat(M, W, vA);
array_view<const int, 2> bMat(W, N, vB);
array_view<int, 2> resultMat(M, N, vC);
// discard_data(): vC's current contents need not be copied to the
// accelerator, since the kernel fully overwrites every element.
resultMat.discard_data();
// Tile edge length; must be a compile-time constant to instantiate tile<>.
const int tile_size = 2;
// Launch one accelerator thread per result element, grouped into
// tile_size x tile_size tiles; each tile shares tile_static memory.
parallel_for_each(resultMat.extent.tile<tile_size, tile_size >(),
[=](tiled_index<tile_size, tile_size> tiled_idx) restrict(amp) {
int row = tiled_idx.local[0]; // This thread's row within its tile
int col = tiled_idx.local[1]; // This thread's column within its tile
int sum = 0; // Running dot product for this thread's output element
// Scratch buffers shared by all threads of one tile (tile_static is
// the tile-shared memory space on the accelerator).
tile_static int localA[tile_size][tile_size], localB[tile_size][tile_size];
#pragma region If memory was single tile this code is enough
//localA[row][col] = aMat(tiled_idx.global[0], col);
//localB[row][col] = bMat(row, tiled_idx.global[0]);
//for (int k = 0; k < TILE_SIZE; k++)
// sum += localA[row][k] * localB[k][col];
#pragma endregion
// Walk the shared dimension W one tile at a time: stage one tile of A
// and one tile of B into shared memory, then accumulate their partial
// product into sum.
for (int i = 0; i < W; i += tile_size)
{
// Each thread of the tile copies exactly one element of each input tile.
localA[row][col] = aMat(tiled_idx.global[0], col + i);
localB[row][col] = bMat(row + i, tiled_idx.global[1]);
tiled_idx.barrier.wait(); // All copies must finish before any thread reads
for (int k = 0; k < tile_size; k++)
sum += localA[row][k] * localB[k][col];
tiled_idx.barrier.wait(); // All reads must finish before the next copy pass
}
// tiled_index converts to its global index, addressing this thread's
// element of the result matrix.
resultMat[tiled_idx] = sum;
});
// Copies the computed results back from the accelerator into vC.
resultMat.synchronize(); // It's good practice to synchronize
/*
If we wanted to target a specific accelerator, we could have passed its
accelerator_view as the first argument to parallel_for_each.
*/
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment