@freeone3000
Created February 22, 2022 02:49
GPU Test Harness for testing approximations of functions
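To build and run (a sketch: the file name harness.cu and the output name are assumptions; no -arch flag is given, so nvcc's default target is used):

    nvcc -O2 -o exp_harness harness.cu
    ./exp_harness

For the small debugging batch, enable the DEBUG path at compile time:

    nvcc -O2 -DDEBUG=1 -o exp_harness harness.cu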
#include <cstdint>
#include <cinttypes>
#include <cmath>
#include <cstdio>
/************************************************
** FUNCTION UNDER TEST ** FUNCTION UNDER TEST **
************************************************/
__device__ double f(double x) {
    double u = 3.9673376333093636e-25;
    u = u * x + 8.2280391928859032e-24;
    u = u * x + 2.7167158481497127e-22;
    u = u * x + 7.76762404858436e-21;
    u = u * x + 2.1307687381845942e-19;
    u = u * x + 5.533034169988047e-18;
    u = u * x + 1.3570249227087202e-16;
    u = u * x + 3.1324366966073745e-15;
    u = u * x + 6.778726355509922e-14;
    u = u * x + 1.3691488853867919e-12;
    u = u * x + 2.5678435993489738e-11;
    u = u * x + 4.4455382718708063e-10;
    u = u * x + 7.0549116208011235e-9;
    u = u * x + 1.01780860092397e-7;
    u = u * x + 1.3215486790144309e-6;
    u = u * x + 1.525273380405984e-5;
    u = u * x + 1.540353039338161e-4;
    u = u * x + 1.3333558146428443e-3;
    u = u * x + 9.6181291076284772e-3;
    u = u * x + 5.550410866482158e-2;
    u = u * x + 2.4022650695910071e-1;
    u = u * x + 6.9314718055994531e-1;
    u = u * x + 1.;
    return u;
}
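/* Sanity check on the coefficients (illustrative, not part of the harness):
   the degree-0 term is 1 = 2^0, the degree-1 term 6.9314718055994531e-1 is
   ln 2, and the degree-2 term 2.4022650695910071e-1 is (ln 2)^2 / 2, so the
   coefficients follow (ln 2)^k / k!: f is a degree-22 truncated Taylor
   series of 2^x, evaluated in Horner form. */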
/** harness begins here **/
typedef union { uint64_t i; double d; } magic; /* bit-cast helper (union type punning, supported by nvcc/gcc) */
constexpr uint64_t num_doubles_btw_zero_and_one = 1ull << 52; /* 2^52 distinct mantissa patterns */
constexpr int warp_size = 32;

__device__ double utf(uint64_t i) { /* bits -> double */
    magic u;
    u.i = i;
    return u.d;
}

__device__ uint64_t ftu(double d) { /* double -> bits */
    magic u;
    u.d = d;
    return u.i;
}
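/* How the enumeration below works (assumes IEEE-754 binary64):
   ftu(1.0) == 0x3ff0000000000000 -- the exponent bits for 2^0 with an
   all-zero mantissa. OR-ing an index i in [0, 2^52) into the mantissa,
   utf(ftu(1.0) | i), yields every double in [1.0, 2.0) exactly once;
   subtracting 1.0 then maps those values onto [0.0, 1.0). */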
__global__ void vector_exp(uint64_t offset, uint64_t* out) {
    /* derive x from the thread index (avoids an input allocation); offset lets the host
       sweep the full input range serially across kernel launches */
    uint64_t x_idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
    double x = utf(x_idx | ftu(1.)) - 1.;
    /* call function under test */
    double u = f(x);
    /* measure the error in ULPs: both values are positive, so the distance
       between their bit patterns is the ULP distance (a plain
       abs(exp2(x) - u) would truncate to zero when stored in a uint64_t) */
    uint64_t expected = ftu(exp2(x));
    uint64_t actual = ftu(u);
    uint64_t error = expected > actual ? expected - actual : actual - expected;
    /* combine items in this warp via an xor butterfly; afterwards every lane holds the warp-wide sum */
    for (int i = warp_size / 2; i >= 1; i /= 2) {
        error += __shfl_xor_sync(0xffffffff, error, i, warp_size);
    }
    /* save one result per warp; index by the launch-local thread id, not x_idx,
       so the write stays inside the results buffer for every offset */
    out[(blockIdx.x * blockDim.x + threadIdx.x) / warp_size] = error;
}
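/* Illustration of the xor butterfly with 4 lanes and masks 2, 1
   (lane l adds the value held by lane l^mask at each step):
     start:  [a,       b,       c,       d      ]
     xor 2:  [a+c,     b+d,     c+a,     d+b    ]
     xor 1:  [a+c+b+d, b+d+a+c, c+a+d+b, d+b+c+a]
   so after log2(warp_size) steps every lane holds the full warp sum,
   and any lane may safely write the per-warp result. */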
int main() {
    /* small batches for debugging */
#if DEBUG
#define N 2048 /* small batch to test */
#else
#define N num_doubles_btw_zero_and_one
#endif /* DEBUG */
    constexpr int simultaneous_threads = 1024;
    constexpr int results_len = simultaneous_threads / warp_size;
    uint64_t* results;
    uint64_t* h_results;
    cudaMalloc(&results, results_len * sizeof(uint64_t)); /* no unified memory: one of the target GPUs does not support it */
    cudaMallocHost(&h_results, results_len * sizeof(uint64_t)); /* but page-locked host memory still speeds up the copies */
    uint64_t acc_err = 0;
    for (uint64_t offset = 0; offset < N; offset += simultaneous_threads) {
        /* run the GPU kernel on the next batch of inputs */
        vector_exp<<<1, simultaneous_threads>>>(offset, results);
        cudaDeviceSynchronize();
        /* serialize the per-warp sums on the host */
        cudaMemcpy(h_results, results, results_len * sizeof(uint64_t), cudaMemcpyDeviceToHost);
        for (int i = 0; i < results_len; ++i) {
            acc_err += h_results[i];
        }
        printf("Results after %" PRIu64 " inputs: %" PRIu64 "\n", offset + simultaneous_threads, acc_err);
    }
printf("===============================\n");
printf("FINAL RESULTS: %lu\n", acc_err);
printf("===============================\n");
/* clean up memory */
cudaFree(results);
cudaFreeHost(h_results);
/* return */
return 0;
}
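For reference, a minimal host-only sketch of the same Horner evaluation (standalone C++, separate from the .cu file above; poly2 is a hypothetical name and the coefficient list is truncated to degree 5, so it only matches exp2 to a few digits):

    #include <cstdio>
    #include <cmath>

    double poly2(double x) {
        /* last six Taylor coefficients of 2^x, (ln 2)^k / k!, highest degree first */
        const double c[] = {1.3333558146428443e-3, 9.6181291076284772e-3,
                            5.550410866482158e-2,  2.4022650695910071e-1,
                            6.9314718055994531e-1, 1.0};
        double u = c[0];
        for (int i = 1; i < 6; ++i) u = u * x + c[i];
        return u;
    }

    int main() {
        const double xs[] = {0.0, 0.25, 0.5, 1.0};
        for (double x : xs) {
            printf("x=%-4g poly=%.12f exp2=%.12f\n", x, poly2(x), std::exp2(x));
        }
        return 0;
    }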