Skip to content

Instantly share code, notes, and snippets.

@grapeot
Last active December 13, 2015 23:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save grapeot/4993451 to your computer and use it in GitHub Desktop.
Save grapeot/4993451 to your computer and use it in GitHub Desktop.
#include <amp.h>
#include <ppl.h>
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#define TS 16
using namespace std;
using namespace concurrency;
int main()
{
const int w = 4096, h = 4096;
vector<float> d(w * h);
for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
cout << "Initialization done." << endl;
array_view<float, 2> dGPU(h, w, d);
vector<float> result(w * h);
array_view<float, 2> resultGPU(h, w, result);
resultGPU.discard_data();
auto c = clock();
parallel_for_each(dGPU.extent,
[=](index<2> idx) restrict(amp)
{
int i = idx[0];
int j = idx[1];
float sum = (float)0;
for (int k = 0; k < dGPU.extent[1]; k++)
sum += dGPU(i, k) * dGPU(k, j);
resultGPU[idx] = sum;
});
resultGPU.synchronize();
cout << "GPU global version: " << clock() - c << " ms. " << endl;
resultGPU.discard_data();
for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
c = clock();
parallel_for_each(dGPU.extent.tile<TS, TS>(),
[=](tiled_index<TS, TS> idx) restrict(amp)
{
tile_static float a[TS][TS], b[TS][TS];
int xGlobal = idx.global[1], yGlobal = idx.global[0];
int xLocal = idx.local[0], yLocal = idx.local[1];
float sum = 0.0f;
for (int i = 0; i < dGPU.extent[1]; i += TS)
{
// copy the variables from global memory to tile memory
// note this function will be executed for every thread (in the tile)
a[yLocal][xLocal] = dGPU(yGlobal, i + xGlobal);
b[yLocal][xLocal] = dGPU(i + yGlobal, xGlobal);
idx.barrier.wait(); // when all the threads finish the copy
for (int j = 0; j < TS; j++)
sum += a[yLocal][j] * b[j][xLocal];
idx.barrier.wait();
}
resultGPU(yGlobal, xGlobal) = sum;
});
resultGPU.synchronize();
cout << "GPU tiled version: " << clock() - c << " ms." << endl;
for_each(d.begin(), d.end(), [](float &i){ i = rand() / 100.0; });
c = clock();
parallel_for (0, h, [&](int i)
{
for (int j = 0; j < w; j++)
{
float sum = 0;
for (int k = 0; k < h; k++)
{
sum += d[i * w + k] * d[k * w + j];
}
result[i * w + j] = sum;
}
});
cout << "CPU version: " << clock() - c << " ms." << endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment