@freeone3000
Created February 22, 2022 02:49
GPU Test Harness for testing approximations of functions
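To build and run (a sketch: the file name harness.cu and the output name are assumptions; no -arch flag is given, so nvcc's default target is used):

    nvcc -O2 -o exp_harness harness.cu
    ./exp_harness

For the small debugging batch, enable the DEBUG path at compile time:

    nvcc -O2 -DDEBUG=1 -o exp_harness harness.cu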
#include <cstdint>
#include <cinttypes>
#include <cmath>
#include <cstdio>
/************************************************
** FUNCTION UNDER TEST ** FUNCTION UNDER TEST **
************************************************/
__device__ double f(double x) {
    double u = 3.9673376333093636e-25;
    u = u * x + 8.2280391928859032e-24;
    u = u * x + 2.7167158481497127e-22;
    u = u * x + 7.76762404858436e-21;
    u = u * x + 2.1307687381845942e-19;
    u = u * x + 5.533034169988047e-18;
    u = u * x + 1.3570249227087202e-16;
    u = u * x + 3.1324366966073745e-15;
    u = u * x + 6.778726355509922e-14;
    u = u * x + 1.3691488853867919e-12;
    u = u * x + 2.5678435993489738e-11;
    u = u * x + 4.4455382718708063e-10;
    u = u * x + 7.0549116208011235e-9;
    u = u * x + 1.01780860092397e-7;
    u = u * x + 1.3215486790144309e-6;
    u = u * x + 1.525273380405984e-5;
    u = u * x + 1.540353039338161e-4;
    u = u * x + 1.3333558146428443e-3;
    u = u * x + 9.6181291076284772e-3;
    u = u * x + 5.550410866482158e-2;
    u = u * x + 2.4022650695910071e-1;
    u = u * x + 6.9314718055994531e-1;
    u = u * x + 1.;
    return u;
}
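/* Sanity check on the coefficients (illustrative, not part of the harness):
   the degree-0 term is 1 = 2^0, the degree-1 term 6.9314718055994531e-1 is
   ln 2, and the degree-2 term 2.4022650695910071e-1 is (ln 2)^2 / 2, so the
   coefficients follow (ln 2)^k / k!: f is a degree-22 truncated Taylor
   series of 2^x, evaluated in Horner form. */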
/** harness begins here **/
typedef union { uint64_t i; double d; } magic; /* bit-cast helper (union type punning, supported by nvcc/gcc) */
constexpr uint64_t num_doubles_btw_zero_and_one = 1ull << 52; /* 2^52 distinct mantissa patterns */
constexpr int warp_size = 32;

__device__ double utf(uint64_t i) { /* bits -> double */
    magic u;
    u.i = i;
    return u.d;
}

__device__ uint64_t ftu(double d) { /* double -> bits */
    magic u;
    u.d = d;
    return u.i;
}
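/* How the enumeration below works (assumes IEEE-754 binary64):
   ftu(1.0) == 0x3ff0000000000000 -- the exponent bits for 2^0 with an
   all-zero mantissa. OR-ing an index i in [0, 2^52) into the mantissa,
   utf(ftu(1.0) | i), yields every double in [1.0, 2.0) exactly once;
   subtracting 1.0 then maps those values onto [0.0, 1.0). */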
__global__ void vector_exp(uint64_t offset, uint64_t* out) {
    /* derive x from the thread index (avoids an input allocation); offset lets the host
       sweep the full input range serially across kernel launches */
    uint64_t x_idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
    double x = utf(x_idx | ftu(1.)) - 1.;
    /* call function under test */
    double u = f(x);
    /* measure the error in ULPs: both values are positive, so the distance
       between their bit patterns is the ULP distance (a plain
       abs(exp2(x) - u) would truncate to zero when stored in a uint64_t) */
    uint64_t expected = ftu(exp2(x));
    uint64_t actual = ftu(u);
    uint64_t error = expected > actual ? expected - actual : actual - expected;
    /* combine items in this warp via an xor butterfly; afterwards every lane holds the warp-wide sum */
    for (int i = warp_size / 2; i >= 1; i /= 2) {
        error += __shfl_xor_sync(0xffffffff, error, i, warp_size);
    }
    /* save one result per warp; index by the launch-local thread id, not x_idx,
       so the write stays inside the results buffer for every offset */
    out[(blockIdx.x * blockDim.x + threadIdx.x) / warp_size] = error;
}
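/* Illustration of the xor butterfly with 4 lanes and masks 2, 1
   (lane l adds the value held by lane l^mask at each step):
     start:  [a,       b,       c,       d      ]
     xor 2:  [a+c,     b+d,     c+a,     d+b    ]
     xor 1:  [a+c+b+d, b+d+a+c, c+a+d+b, d+b+c+a]
   so after log2(warp_size) steps every lane holds the full warp sum,
   and any lane may safely write the per-warp result. */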
int main() {
    /* small batches for debugging */
#if DEBUG
#define N 2048 /* small batch to test */
#else
#define N num_doubles_btw_zero_and_one
#endif /* DEBUG */
    constexpr int simultaneous_threads = 1024;
    constexpr int results_len = simultaneous_threads / warp_size;
    uint64_t* results;
    uint64_t* h_results;
    cudaMalloc(&results, results_len * sizeof(uint64_t)); /* no unified memory: one of the target GPUs does not support it */
    cudaMallocHost(&h_results, results_len * sizeof(uint64_t)); /* but page-locked host memory still speeds up the copies */
    uint64_t acc_err = 0;
    for (uint64_t offset = 0; offset < N; offset += simultaneous_threads) {
        /* run the GPU kernel on the next batch of inputs */
        vector_exp<<<1, simultaneous_threads>>>(offset, results);
        cudaDeviceSynchronize();
        /* serialize the per-warp sums on the host */
        cudaMemcpy(h_results, results, results_len * sizeof(uint64_t), cudaMemcpyDeviceToHost);
        for (int i = 0; i < results_len; ++i) {
            acc_err += h_results[i];
        }
        printf("Results after %" PRIu64 " inputs: %" PRIu64 "\n", offset + simultaneous_threads, acc_err);
    }
printf("===============================\n");
printf("FINAL RESULTS: %lu\n", acc_err);
printf("===============================\n");
/* clean up memory */
cudaFree(results);
cudaFreeHost(h_results);
/* return */
return 0;
}
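For reference, a minimal host-only sketch of the same Horner evaluation (standalone C++, separate from the .cu file above; poly2 is a hypothetical name and the coefficient list is truncated to degree 5, so it only matches exp2 to a few digits):

    #include <cstdio>
    #include <cmath>

    double poly2(double x) {
        /* last six Taylor coefficients of 2^x, (ln 2)^k / k!, highest degree first */
        const double c[] = {1.3333558146428443e-3, 9.6181291076284772e-3,
                            5.550410866482158e-2,  2.4022650695910071e-1,
                            6.9314718055994531e-1, 1.0};
        double u = c[0];
        for (int i = 1; i < 6; ++i) u = u * x + c[i];
        return u;
    }

    int main() {
        const double xs[] = {0.0, 0.25, 0.5, 1.0};
        for (double x : xs) {
            printf("x=%-4g poly=%.12f exp2=%.12f\n", x, poly2(x), std::exp2(x));
        }
        return 0;
    }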