grapeot/amp_test.cpp

## amp_test.cpp
#include <amp.h>
#include <iostream>
#include <ctime>
#include <cstdlib>
#define TS 16

using namespace std;
using namespace concurrency;

int main()
{
	const int w = 4096, h = 4096;
	vector<float> d(w * h);
	for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
	cout << "Initialization done." << endl;
	array_view<float, 2> dGPU(h, w, d);
	vector<float> result(w * h);
	array_view<float, 2> resultGPU(h, w, result);
	resultGPU.discard_data();

	auto c = clock();
	parallel_for_each(dGPU.extent,
		[=](index<2> idx) restrict(amp)
	{
		int i = idx[0];
		int j = idx[1];
		float sum = (float)0;
		for (int k = 0; k < dGPU.extent[1]; k++)
			sum += dGPU(i, k) * dGPU(k, j);
		resultGPU[idx] = sum;
	});
	resultGPU.synchronize();
	cout << "GPU global version: " << clock() - c << " ms. " << endl;
	resultGPU.discard_data();

	for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
	c = clock();
	parallel_for_each(dGPU.extent.tile<TS, TS>(),
		[=](tiled_index<TS, TS> idx) restrict(amp)
	{
		tile_static float a[TS][TS], b[TS][TS];
		int xGlobal = idx.global[1], yGlobal = idx.global[0];
		int xLocal = idx.local[0], yLocal = idx.local[1];
		float sum = 0.0f;
		for (int i = 0; i < dGPU.extent[1]; i += TS)
		{
			// copy the variables from global memory to tile memory
			// note this function will be executed for every thread (in the tile)
			a[yLocal][xLocal] = dGPU(yGlobal, i + xGlobal);
			b[yLocal][xLocal] = dGPU(i + yGlobal, xGlobal);
			idx.barrier.wait();	// when all the threads finish the copy
			for (int j = 0; j < TS; j++)
				sum += a[yLocal][j] * b[j][xLocal];
			idx.barrier.wait();
		}
		resultGPU(yGlobal, xGlobal) = sum;
	});
	resultGPU.synchronize();
	cout << "GPU tiled version: " << clock() - c << " ms." << endl;

	for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
	c = clock();
	parallel_for (0, h, [&](int i)
	{
		for (int j = 0; j < w; j++)
		{
			float sum = 0;
			for (int k = 0; k < h; k++)
			{
				sum += d[i * w + k] * d[k * w + j];
			}
			result[i * w + j] = sum;
		}
	});
	cout << "CPU version: " << clock() - c << " ms." << endl;

	return 0;
}
	#include <amp.h>
	#include <iostream>
	#include <ctime>
	#include <cstdlib>
	#define TS 16

	using namespace std;
	using namespace concurrency;

	int main()
	{
	const int w = 4096, h = 4096;
	vector<float> d(w * h);
	for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
	cout << "Initialization done." << endl;
	array_view<float, 2> dGPU(h, w, d);
	vector<float> result(w * h);
	array_view<float, 2> resultGPU(h, w, result);
	resultGPU.discard_data();

	auto c = clock();
	parallel_for_each(dGPU.extent,
	[=](index<2> idx) restrict(amp)
	{
	int i = idx[0];
	int j = idx[1];
	float sum = (float)0;
	for (int k = 0; k < dGPU.extent[1]; k++)
	sum += dGPU(i, k) * dGPU(k, j);
	resultGPU[idx] = sum;
	});
	resultGPU.synchronize();
	cout << "GPU global version: " << clock() - c << " ms. " << endl;
	resultGPU.discard_data();

	for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
	c = clock();
	parallel_for_each(dGPU.extent.tile<TS, TS>(),
	[=](tiled_index<TS, TS> idx) restrict(amp)
	{
	tile_static float a[TS][TS], b[TS][TS];
	int xGlobal = idx.global[1], yGlobal = idx.global[0];
	int xLocal = idx.local[0], yLocal = idx.local[1];
	float sum = 0.0f;
	for (int i = 0; i < dGPU.extent[1]; i += TS)
	{
	// copy the variables from global memory to tile memory
	// note this function will be executed for every thread (in the tile)
	a[yLocal][xLocal] = dGPU(yGlobal, i + xGlobal);
	b[yLocal][xLocal] = dGPU(i + yGlobal, xGlobal);
	idx.barrier.wait(); // when all the threads finish the copy
	for (int j = 0; j < TS; j++)
	sum += a[yLocal][j] * b[j][xLocal];
	idx.barrier.wait();
	}
	resultGPU(yGlobal, xGlobal) = sum;
	});
	resultGPU.synchronize();
	cout << "GPU tiled version: " << clock() - c << " ms." << endl;

	for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
	c = clock();
	parallel_for (0, h, [&](int i)
	{
	for (int j = 0; j < w; j++)
	{
	float sum = 0;
	for (int k = 0; k < h; k++)
	{
	sum += d[i * w + k] * d[k * w + j];
	}
	result[i * w + j] = sum;
	}
	});
	cout << "CPU version: " << clock() - c << " ms." << endl;

	return 0;
	}