@JosephLaurino
Last active December 12, 2015 06:29
Profiling experiment comparing CPU (non-threaded), PPL, and C++ AMP based implementations. The cost of moving data to the GPU can only be recouped if the computation time is much larger than the data transfer time. In this experiment, the GPU (via AMP) only started winning once the loop count increased past 200 iterations. Profile first! PPL might have …
// ----------------------------------------------------------------------------
#include <ppl.h>
#include <amp.h>
#include <amp_math.h>
#include <iostream>
#include <boost/chrono/chrono.hpp>
#include <vector>
using namespace concurrency;
using namespace concurrency::precise_math;
const int size = 80000;
const int loopCount = 1;
/*
Setup: Visual Studio 2012, Win7, i7, GeForce 650M
loopCount = 1
cpu took 0.00463677 seconds
ppl took 0.00497504 seconds
gpu took 0.16018 seconds
loopCount = 20
cpu took 0.0483519 seconds
ppl took 0.0113667 seconds
gpu took 0.166711 seconds
loopCount = 200
cpu took 0.244843 seconds
ppl took 0.061472 seconds
gpu took 0.150505 seconds
loopCount = 2000
cpu took 2.26371 seconds
ppl took 0.533806 seconds
gpu took 0.202005 seconds
loopCount = 20000
cpu took 22.3698 seconds
ppl took 5.25856 seconds
gpu took 0.66479 seconds
*/
void test_PPLMethod(std::vector<float>& result) {
    boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now();
    float aCPP[size];
    float bCPP[size];
    float sumCPP[size];
    for( int i = 0; i < size; i++ ) {
        aCPP[i] = i;
        bCPP[i] = i*i;
    }
    // PPL distributes the per-element work across the CPU thread pool.
    parallel_for( 0, size, [&](int idx) {
        sumCPP[idx] = pow(aCPP[idx], bCPP[idx]);
        for( int i = 0; i < loopCount; i++ ) {
            sumCPP[idx] = pow(sumCPP[idx], bCPP[idx]);
        }
    });
    boost::chrono::duration<double> sec = boost::chrono::steady_clock::now() - start;
    std::cout << "ppl took " << sec.count() << " seconds\n";
    result.clear();
    for( int i = 0; i < size; i++ ) {
        result.push_back(sumCPP[i]);
    }
}
void test_AmpMethod(std::vector<float>& result) {
    boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now();
    float aCPP[size];
    float bCPP[size];
    float sumCPP[size];
    for( int i = 0; i < size; i++ ) {
        aCPP[i] = i;
        bCPP[i] = i*i;
    }
    // Create C++ AMP objects. The array_views wrap the host arrays; the data
    // is copied to the accelerator when the kernel runs.
    array_view<const float, 1> a(size, aCPP);
    array_view<const float, 1> b(size, bCPP);
    array_view<float, 1> sum(size, sumCPP);
    sum.discard_data(); // sumCPP holds no input, so skip the host-to-device copy
    parallel_for_each( sum.extent, [=](index<1> idx) restrict(amp) {
        sum[idx] = pow(a[idx], b[idx]);
        for( int i = 0; i < loopCount; i++ ) {
            sum[idx] = pow(sum[idx], b[idx]);
        }
    });
    // Block until the kernel finishes and the results are copied from the GPU
    // back to sumCPP, so the transfer cost is included in the timing.
    sum.synchronize();
    boost::chrono::duration<double> sec = boost::chrono::steady_clock::now() - start;
    std::cout << "gpu took " << sec.count() << " seconds\n";
    result.clear();
    for( int i = 0; i < size; i++ ) {
        result.push_back(sum[i]);
    }
}
void test_CPUMethod(std::vector<float>& result) {
    boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now();
    float aCPP[size];
    float bCPP[size];
    float sumCPP[size];
    for( int i = 0; i < size; i++ ) {
        aCPP[i] = i;
        bCPP[i] = i*i;
    }
    for( int idx = 0; idx < size; idx++) {
        sumCPP[idx] = pow(aCPP[idx], bCPP[idx]);
        for( int i = 0; i < loopCount; i++ ) {
            sumCPP[idx] = pow(sumCPP[idx], bCPP[idx]);
        }
    }
    boost::chrono::duration<double> sec = boost::chrono::steady_clock::now() - start;
    std::cout << "cpu took " << sec.count() << " seconds\n";
    result.clear();
    for( int i = 0; i < size; i++ ) {
        result.push_back(sumCPP[i]);
    }
}
// ----------------------------------------------------------------------------
int main(int argc, char* argv[]) {
    std::vector<float> cpuResult;
    std::vector<float> pplResult;
    std::vector<float> gpuResult;
    test_CPUMethod(cpuResult);
    test_PPLMethod(pplResult);
    test_AmpMethod(gpuResult);
    // Verify that all three implementations produced the same results.
    for( size_t i = 0; i < cpuResult.size(); i++ )
    {
        if( (cpuResult[i] != pplResult[i]) ||
            (cpuResult[i] != gpuResult[i]) ) {
            std::cout << "bad calc at " << i << "\n";
            std::cout << "cpuResult[i] " << cpuResult[i] << "\n";
            std::cout << "pplResult[i] " << pplResult[i] << "\n";
            std::cout << "gpuResult[i] " << gpuResult[i] << "\n";
            break;
        }
    }
    return 0;
}
@JosephLaurino (Author)

  • Added data verification and the call to sum.synchronize() in test_AmpMethod. Without the call to synchronize, the cost of copying data from the GPU back to the CPU was not included in the profile timings.
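To illustrate the effect, here is a minimal sketch (not part of the gist; timing_sketch and the trivial doubling kernel are made up for the example): parallel_for_each only queues the work on the accelerator, so stopping the clock right after it misses most of the GPU cost, while synchronize() blocks until the kernel finishes and the data is copied back to the host.

// Sketch (assumed names, not from the gist): time the same AMP dispatch with
// and without synchronize() to see where the device-to-host copy cost lands.
#include <amp.h>
#include <boost/chrono/chrono.hpp>
#include <iostream>
#include <vector>

void timing_sketch() {
    using namespace concurrency;
    const int n = 80000;
    std::vector<float> data(n, 1.0f);
    array_view<float, 1> v(n, data);

    boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now();
    parallel_for_each(v.extent, [=](index<1> idx) restrict(amp) {
        v[idx] = v[idx] * 2.0f; // trivial kernel, just to have work queued
    });
    boost::chrono::duration<double> queued = boost::chrono::steady_clock::now() - start;

    v.synchronize(); // blocks until the GPU finishes and writes back to 'data'
    boost::chrono::duration<double> done = boost::chrono::steady_clock::now() - start;

    std::cout << "after parallel_for_each: " << queued.count() << " s\n";
    std::cout << "after synchronize():     " << done.count() << " s\n";
}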

@JosephLaurino (Author)

I also discovered that the first run of the C++ AMP code triggers compilation of the kernel for the GPU. To profile properly, one needs to exclude that first run and do multiple runs.
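One way to do that, as a rough sketch (profile_amp is my own name, and it assumes test_AmpMethod from the listing above is visible in the same translation unit): run the AMP path once untimed to absorb the one-time kernel compilation, then average several timed runs.

// Sketch: warm up once, then average several runs of the AMP implementation.
void profile_amp(int runs) {
    std::vector<float> result;
    test_AmpMethod(result); // warm-up run pays the one-time GPU kernel compilation cost; ignore its timing
    boost::chrono::steady_clock::time_point start = boost::chrono::steady_clock::now();
    for( int i = 0; i < runs; i++ ) {
        test_AmpMethod(result); // steady-state runs
    }
    boost::chrono::duration<double> sec = boost::chrono::steady_clock::now() - start;
    std::cout << "gpu average over " << runs << " runs: " << sec.count() / runs << " seconds\n";
}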
