All NVIDIA CUDA 10.0 sample codes combined together to allow regex.
This file has been truncated.
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <thrust/random.h>
#include <thrust/generate.h>
#include <thrust/detail/type_traits.h>
#include <helper_cuda.h>
#include <algorithm>
#include <time.h>
#include <limits.h>
template <typename T, bool floatKeys>
bool testSort(int argc, char **argv)
{
int cmdVal;
int keybits = 32;
unsigned int numElements = 1048576;
bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly");
bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet");
if (checkCmdLineFlag(argc, (const char **)argv, "n"))
{
cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n");
numElements = cmdVal;
if (cmdVal <= 0)
{
printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal);
exit(EXIT_SUCCESS);
}
}
if (checkCmdLineFlag(argc, (const char **)argv, "keybits"))
{
cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits");
keybits = cmdVal;
if (keybits <= 0)
{
printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits);
exit(EXIT_SUCCESS);
}
}
unsigned int numIterations = (numElements >= 16777216) ? 10 : 100;
if (checkCmdLineFlag(argc, (const char **)argv, "iterations"))
{
cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations");
numIterations = cmdVal;
}
if (checkCmdLineFlag(argc, (const char **)argv, "help"))
{
printf("Command line:\nradixSortThrust [-option]\n");
printf("Valid options:\n");
printf("-n=<N> : number of elements to sort\n");
printf("-keybits=bits : keybits must be > 0\n");
printf("-keysonly : only sort an array of keys (default sorts key-value pairs)\n");
printf("-float : use 32-bit float keys (default is 32-bit unsigned int)\n");
printf("-quiet : Output only the number of elements and the time to sort\n");
printf("-help : Output a help message\n");
exit(EXIT_SUCCESS);
}
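// Example invocation (flags as documented in the help text above; the binary
// name is the one printed by the help message):
//   ./radixSortThrust -n=1048576 -float -keysonly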
if (!quiet)
printf("\nSorting %d %d-bit %s keys %s\n\n", numElements, keybits, floatKeys ? "float" : "unsigned int", keysOnly ? "(only)" : "and values");
int deviceID = -1;
if (cudaSuccess == cudaGetDevice(&deviceID))
{
cudaDeviceProp devprop;
cudaGetDeviceProperties(&devprop, deviceID);
size_t totalMem = (keysOnly ? 2 : 4) * (size_t)numElements * sizeof(T);
if (devprop.totalGlobalMem < totalMem)
{
printf("Error: insufficient amount of memory to sort %u elements.\n", numElements);
printf("%zu bytes needed, %zu bytes available\n", totalMem, devprop.totalGlobalMem);
exit(EXIT_SUCCESS);
}
}
thrust::host_vector<T> h_keys(numElements);
thrust::host_vector<T> h_keysSorted(numElements);
thrust::host_vector<unsigned int> h_values;
if (!keysOnly)
h_values = thrust::host_vector<unsigned int>(numElements);
// Fill up with some random data
thrust::default_random_engine rng(clock());
if (floatKeys)
{
thrust::uniform_real_distribution<float> u01(0, 1);
for (int i = 0; i < (int)numElements; i++)
h_keys[i] = u01(rng);
}
else
{
thrust::uniform_int_distribution<unsigned int> u(0, UINT_MAX);
for (int i = 0; i < (int)numElements; i++)
h_keys[i] = u(rng);
}
if (!keysOnly)
thrust::sequence(h_values.begin(), h_values.end());
// Copy data onto the GPU
thrust::device_vector<T> d_keys;
thrust::device_vector<unsigned int> d_values;
// run multiple iterations to compute an average sort time
cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
float totalTime = 0;
for (unsigned int i = 0; i < numIterations; i++)
{
// reset data before sort
d_keys = h_keys;
if (!keysOnly)
d_values = h_values;
checkCudaErrors(cudaEventRecord(start_event, 0));
if (keysOnly)
thrust::sort(d_keys.begin(), d_keys.end());
else
thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
float time = 0;
checkCudaErrors(cudaEventElapsedTime(&time, start_event, stop_event));
totalTime += time;
}
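// cudaEventElapsedTime() reports milliseconds, so dividing the accumulated
// time by (1.0e3f * numIterations) below yields the average sort time in
// seconds. Example with hypothetical numbers: 250 ms accumulated over 100
// iterations gives 0.0025 s per sort, so for 1048576 elements the printed
// throughput would be 1.0e-6 * 1048576 / 0.0025, about 419.4 MElements/s.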
totalTime /= (1.0e3f * numIterations);
printf("radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements\n",
1.0e-6f * numElements / totalTime, totalTime, numElements);
getLastCudaError("after radixsort");
// Get results back to host for correctness checking
thrust::copy(d_keys.begin(), d_keys.end(), h_keysSorted.begin());
if (!keysOnly)
thrust::copy(d_values.begin(), d_values.end(), h_values.begin());
getLastCudaError("copying results to host memory");
// Check results
bool bTestResult = thrust::is_sorted(h_keysSorted.begin(), h_keysSorted.end());
checkCudaErrors(cudaEventDestroy(start_event));
checkCudaErrors(cudaEventDestroy(stop_event));
if (!bTestResult && !quiet)
{
return false;
}
return bTestResult;
}
int main(int argc, char **argv)
{
// Start logs
printf("%s Starting...\n\n", argv[0]);
findCudaDevice(argc, (const char **)argv);
bool bTestResult = false;
if (checkCmdLineFlag(argc, (const char **)argv, "float"))
bTestResult = testSort<float, true>(argc, argv);
else
bTestResult = testSort<unsigned int, false>(argc, argv);
printf(bTestResult ? "Test passed\n" : "Test failed!\n");
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//
// This sample demonstrates how HyperQ allows supporting devices to avoid false
// dependencies between kernels in different streams.
//
// - Devices without HyperQ will run a maximum of two kernels at a time (one
// kernel_A and one kernel_B).
// - Devices with HyperQ will run up to 32 kernels simultaneously.
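//
// For a rough sense of scale with the defaults used below (nstreams = 32
// streams, kernel_time = 10 ms per kernel): fully concurrent execution of all
// {kernel_A, kernel_B} pairs takes about 2 * 10 ms = 20 ms, whereas serialized
// execution lands between (32 + 1) * 10 ms = 330 ms and 2 * 32 * 10 ms = 640 ms
// (these are the bounds printed by main() at the end of the run).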
#include <stdio.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_functions.h>
#include <helper_cuda.h>
const char *sSDKsample = "hyperQ";
// This subroutine does no real work but runs for at least the specified number
// of clock ticks.
__device__ void clock_block(clock_t *d_o, clock_t clock_count)
{
unsigned int start_clock = (unsigned int) clock();
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetic):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
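// Numerical example (hypothetical values): start_clock = 0xFFFFFFF0 and
// end_clock = 0x00000010 give end_clock - start_clock = 0x00000020 = 32
// ticks, the correct elapsed count despite the 32-bit counter wrapping.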
clock_offset = (clock_t) (end_clock - start_clock);
}
d_o[0] = clock_offset;
}
// We create two identical kernels calling clock_block(), we create two so that
// we can identify dependencies in the profile timeline ("kernel_B" is always
// dependent on "kernel_A" in the same stream).
__global__ void kernel_A(clock_t *d_o, clock_t clock_count)
{
clock_block(d_o, clock_count);
}
__global__ void kernel_B(clock_t *d_o, clock_t clock_count)
{
clock_block(d_o, clock_count);
}
// Single-warp reduction kernel (note: kept simple rather than optimized)
__global__ void sum(clock_t *d_clocks, int N)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ clock_t s_clocks[32];
clock_t my_sum = 0;
for (int i = threadIdx.x ; i < N ; i += blockDim.x)
{
my_sum += d_clocks[i];
}
s_clocks[threadIdx.x] = my_sum;
cg::sync(cta);
for (int i = warpSize / 2 ; i > 0 ; i /= 2)
{
if (threadIdx.x < i)
{
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
}
cg::sync(cta);
}
if (threadIdx.x == 0)
{
d_clocks[0] = s_clocks[0];
}
}
int main(int argc, char **argv)
{
int nstreams = 32; // One stream for each pair of kernels
float kernel_time = 10; // Time each kernel should run in ms
float elapsed_time;
int cuda_device = 0;
printf("starting %s...\n", sSDKsample);
// Get number of streams (if overridden on the command line)
if (checkCmdLineFlag(argc, (const char **)argv, "nstreams"))
{
nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams");
}
// Use command-line specified CUDA device, otherwise use device with
// highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
// Get device properties
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
// HyperQ is available in devices of Compute Capability 3.5 and higher
if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
{
if (deviceProp.concurrentKernels == 0)
{
printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n");
printf(" CUDA kernel runs will be serialized\n");
}
else
{
printf("> GPU does not support HyperQ\n");
printf(" CUDA kernel runs will have limited concurrency\n");
}
}
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// Allocate host memory for the output (reduced to a single value)
clock_t *a = 0;
checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t)));
// Allocate device memory for the output (one value for each kernel)
clock_t *d_a = 0;
checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));
// Allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0 ; i < nstreams ; i++)
{
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// Create CUDA event handles
cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
// Target time per kernel is kernel_time ms, clockRate is in KHz
// Target number of clocks = target time * clock frequency
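// Example (hypothetical 1 GHz GPU, i.e. clockRate = 1,000,000 kHz):
// time_clocks = 10 ms * 1,000,000 kHz = 10,000,000 clocks, since ms * kHz
// cancels to a plain cycle count.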
#if defined(__arm__) || defined(__aarch64__)
// On Arm architectures the kernel would run longer than the channel reset time, so reduce time_clocks to prevent hangs.
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
#else
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
clock_t total_clocks = 0;
// Start the clock
checkCudaErrors(cudaEventRecord(start_event, 0));
// Queue pairs of {kernel_A, kernel_B} in separate streams
for (int i = 0 ; i < nstreams ; ++i)
{
kernel_A<<<1,1,0,streams[i]>>>(&d_a[2*i], time_clocks);
total_clocks += time_clocks;
kernel_B<<<1,1,0,streams[i]>>>(&d_a[2*i+1], time_clocks);
total_clocks += time_clocks;
}
// Stop the clock in stream 0 (i.e. all previous kernels will be complete)
checkCudaErrors(cudaEventRecord(stop_event, 0));
// At this point the CPU has dispatched all work for the GPU and can
// continue processing other tasks in parallel. In this sample we just want
// to wait until all work is done so we use a blocking cudaMemcpy below.
// Run the sum kernel and copy the result back to host
sum<<<1,32>>>(d_a, 2 * nstreams);
checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost));
// stop_event will already have completed (the blocking cudaMemcpy above
// synchronizes), but the explicit synchronize is kept so this timing snippet
// stays correct if copied elsewhere.
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("Expected time for serial execution of %d sets of kernels is between approx. %.3fs and %.3fs\n", nstreams, (nstreams + 1) * kernel_time / 1000.0f, 2 * nstreams *kernel_time / 1000.0f);
printf("Expected time for fully concurrent execution of %d sets of kernels is approx. %.3fs\n", nstreams, 2 * kernel_time / 1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);
bool bTestResult = (a[0] >= total_clocks);
// Release resources
for (int i = 0 ; i < nstreams ; i++)
{
cudaStreamDestroy(streams[i]);
}
free(streams);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Computation of eigenvalues of a small symmetric, tridiagonal matrix */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, project
#include "helper_functions.h"
#include "helper_cuda.h"
#include "config.h"
#include "structs.h"
#include "matlab.h"
// includes, kernels
#include "bisect_kernel_small.cuh"
// includes, file
#include "bisect_small.cuh"
////////////////////////////////////////////////////////////////////////////////
//! Determine eigenvalues for matrices smaller than MAX_SMALL_MATRIX
//! @param input handles to input data of kernel
//! @param result handles to result of kernel
//! @param mat_size matrix size
//! @param lg lower limit of Gerschgorin interval
//! @param ug upper limit of Gerschgorin interval
//! @param precision desired precision of eigenvalues
//! @param iterations number of iterations for timing
////////////////////////////////////////////////////////////////////////////////
void
computeEigenvaluesSmallMatrix(const InputData &input, ResultDataSmall &result,
const unsigned int mat_size,
const float lg, const float ug,
const float precision,
const unsigned int iterations)
{
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
for (unsigned int i = 0; i < iterations; ++i)
{
dim3 blocks(1, 1, 1);
dim3 threads(MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1);
bisectKernel<<< blocks, threads >>>(input.g_a, input.g_b, mat_size,
result.g_left, result.g_right,
result.g_left_count,
result.g_right_count,
lg, ug, 0, mat_size,
precision
);
}
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer);
getLastCudaError("Kernel launch failed");
printf("Average time: %f ms (%i iterations)\n",
sdkGetTimerValue(&timer) / (float) iterations, iterations);
sdkDeleteTimer(&timer);
}
////////////////////////////////////////////////////////////////////////////////
//! Initialize variables and memory for the result for small matrices
//! @param result handles to the necessary memory
//! @param mat_size matrix size
////////////////////////////////////////////////////////////////////////////////
void
initResultSmallMatrix(ResultDataSmall &result, const unsigned int mat_size)
{
result.mat_size_f = sizeof(float) * mat_size;
result.mat_size_ui = sizeof(unsigned int) * mat_size;
result.eigenvalues = (float *) malloc(result.mat_size_f);
// helper variables
result.zero_f = (float *) malloc(result.mat_size_f);
result.zero_ui = (unsigned int *) malloc(result.mat_size_ui);
for (unsigned int i = 0; i < mat_size; ++i)
{
result.zero_f[i] = 0.0f;
result.zero_ui[i] = 0;
result.eigenvalues[i] = 0.0f;
}
checkCudaErrors(cudaMalloc((void **) &result.g_left, result.mat_size_f));
checkCudaErrors(cudaMalloc((void **) &result.g_right, result.mat_size_f));
checkCudaErrors(cudaMalloc((void **) &result.g_left_count,
result.mat_size_ui));
checkCudaErrors(cudaMalloc((void **) &result.g_right_count,
result.mat_size_ui));
// initialize result memory
checkCudaErrors(cudaMemcpy(result.g_left, result.zero_f, result.mat_size_f,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_right, result.zero_f, result.mat_size_f,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_right_count, result.zero_ui,
result.mat_size_ui,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_left_count, result.zero_ui,
result.mat_size_ui,
cudaMemcpyHostToDevice));
}
////////////////////////////////////////////////////////////////////////////////
//! Cleanup memory and variables for result for small matrices
//! @param result handle to variables
////////////////////////////////////////////////////////////////////////////////
void
cleanupResultSmallMatrix(ResultDataSmall &result)
{
freePtr(result.eigenvalues);
freePtr(result.zero_f);
freePtr(result.zero_ui);
checkCudaErrors(cudaFree(result.g_left));
checkCudaErrors(cudaFree(result.g_right));
checkCudaErrors(cudaFree(result.g_left_count));
checkCudaErrors(cudaFree(result.g_right_count));
}
////////////////////////////////////////////////////////////////////////////////
//! Process the result obtained on the device, that is transfer to host and
//! perform basic sanity checking
//! @param input handles to input data
//! @param result handles to result data
//! @param mat_size matrix size
//! @param filename output filename
////////////////////////////////////////////////////////////////////////////////
void
processResultSmallMatrix(const InputData &input, const ResultDataSmall &result,
const unsigned int mat_size,
const char *filename)
{
const unsigned int mat_size_f = sizeof(float) * mat_size;
const unsigned int mat_size_ui = sizeof(unsigned int) * mat_size;
// copy data back to host
float *left = (float *) malloc(mat_size_f);
unsigned int *left_count = (unsigned int *) malloc(mat_size_ui);
checkCudaErrors(cudaMemcpy(left, result.g_left, mat_size_f,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(left_count, result.g_left_count, mat_size_ui,
cudaMemcpyDeviceToHost));
float *eigenvalues = (float *) malloc(mat_size_f);
for (unsigned int i = 0; i < mat_size; ++i)
{
eigenvalues[left_count[i]] = left[i];
}
// save result in matlab format
writeTridiagSymMatlab(filename, input.a, input.b+1, eigenvalues, mat_size);
freePtr(left);
freePtr(left_count);
freePtr(eigenvalues);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Computation of eigenvalues of a small symmetric, tridiagonal matrix */
#ifndef _BISECT_SMALL_CUH_
#define _BISECT_SMALL_CUH_
extern "C" {
////////////////////////////////////////////////////////////////////////////////
//! Determine eigenvalues for matrices smaller than MAX_SMALL_MATRIX
//! @param input handles to input data of kernel
//! @param result handles to result of kernel
//! @param mat_size matrix size
//! @param lg lower limit of Gerschgorin interval
//! @param ug upper limit of Gerschgorin interval
//! @param precision desired precision of eigenvalues
//! @param iterations number of iterations for timing
////////////////////////////////////////////////////////////////////////////////
void
computeEigenvaluesSmallMatrix(const InputData &input, ResultDataSmall &result,
const unsigned int mat_size,
const float lg, const float ug,
const float precision,
const unsigned int iterations);
////////////////////////////////////////////////////////////////////////////////
//! Initialize variables and memory for the result for small matrices
//! @param result handles to the necessary memory
//! @param mat_size matrix size
////////////////////////////////////////////////////////////////////////////////
void
initResultSmallMatrix(ResultDataSmall &result, const unsigned int mat_size);
////////////////////////////////////////////////////////////////////////////////
//! Cleanup memory and variables for result for small matrices
//! @param result handle to variables
////////////////////////////////////////////////////////////////////////////////
void
cleanupResultSmallMatrix(ResultDataSmall &result);
////////////////////////////////////////////////////////////////////////////////
//! Process the result obtained on the device, that is transfer to host and
//! perform basic sanity checking
//! @param input handles to input data
//! @param result handles to result variables
//! @param mat_size matrix size
//! @param filename output filename
////////////////////////////////////////////////////////////////////////////////
void
processResultSmallMatrix(const InputData &input, const ResultDataSmall &result,
const unsigned int mat_size, const char *filename);
}
#endif // #ifndef _BISECT_SMALL_CUH_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Computation of eigenvalues of symmetric, tridiagonal matrix using
* bisection.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <assert.h>
// includes, project
#include <helper_functions.h>
#include <helper_cuda.h>
#include "config.h"
#include "structs.h"
#include "matlab.h"
#include "util.h"
#include "gerschgorin.h"
#include "bisect_small.cuh"
#include "bisect_large.cuh"
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
bool runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
bool bQAResults = false;
printf("Starting eigenvalues\n");
bQAResults = runTest(argc, argv);
printf("Test %s\n", bQAResults ? "Succeeded!" : "Failed!");
exit(bQAResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Initialize the input data to the algorithm
//! @param input handles to the input data
//! @param exec_path path where executable is run (argv[0])
//! @param mat_size size of the matrix
//! @param user_defined 1 if the matrix size has been requested by the user,
//! 0 if the default size is used
////////////////////////////////////////////////////////////////////////////////
void
initInputData(InputData &input, char *exec_path,
const unsigned int mat_size, const unsigned int user_defined)
{
// allocate memory
input.a = (float *) malloc(sizeof(float) * mat_size);
input.b = (float *) malloc(sizeof(float) * mat_size);
if (1 == user_defined)
{
// initialize diagonal and superdiagonal entries with random values
srand(278217421);
// srand( clock());
for (unsigned int i = 0; i < mat_size; ++i)
{
input.a[i] = (float)(2.0 * (((double)rand()
/ (double) RAND_MAX) - 0.5));
input.b[i] = (float)(2.0 * (((double)rand()
/ (double) RAND_MAX) - 0.5));
}
// the first element of b is used as padding on the device (thus the
// whole vector is copied to the device but the kernels are launched
// with (b + 1) as the start address)
input.b[0] = 0.0f;
}
else
{
// read default matrix
unsigned int input_data_size = mat_size;
char *diag_path = sdkFindFilePath("diagonal.dat", exec_path);
assert(NULL != diag_path);
sdkReadFile(diag_path, &(input.a), &input_data_size, false);
char *sdiag_path = sdkFindFilePath("superdiagonal.dat", exec_path);
assert(NULL != sdiag_path);
sdkReadFile(sdiag_path, &(input.b), &input_data_size, false);
free(diag_path);
free(sdiag_path);
}
// allocate device memory for input
checkCudaErrors(cudaMalloc((void **) &(input.g_a) , sizeof(float) * mat_size));
checkCudaErrors(cudaMalloc((void **) &(input.g_b_raw), sizeof(float) * mat_size));
// copy data to device
checkCudaErrors(cudaMemcpy(input.g_a , input.a, sizeof(float) * mat_size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(input.g_b_raw, input.b, sizeof(float) * mat_size, cudaMemcpyHostToDevice));
input.g_b = input.g_b_raw + 1;
}
////////////////////////////////////////////////////////////////////////////////
//! Clean up input data, in particular allocated memory
//! @param input handles to the input data
////////////////////////////////////////////////////////////////////////////////
void
cleanupInputData(InputData &input)
{
freePtr(input.a);
freePtr(input.b);
checkCudaErrors(cudaFree(input.g_a));
input.g_a = NULL;
checkCudaErrors(cudaFree(input.g_b_raw));
input.g_b_raw = NULL;
input.g_b = NULL;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a specific matrix size has to be used
//! @param argc number of command line arguments (from main(argc, argv))
//! @param argv pointers to command line arguments (from main(argc, argv))
//! @param mat_size size of matrix, updated if a specific size is specified
//! on the command line
//! @param user_defined set to 1 if the matrix size was specified on the command line
////////////////////////////////////////////////////////////////////////////////
void
getMatrixSize(int argc, char **argv,
unsigned int &mat_size, unsigned int &user_defined)
{
int temp = -1;
if (checkCmdLineFlag(argc, (const char **)argv, "matrix-size"))
{
temp = getCmdLineArgumentInt(argc, (const char **) argv, "matrix-size");
}
if (temp > 0)
{
mat_size = (unsigned int) temp;
// data type short is used in the kernel
assert(mat_size < (1 << 16));
// mat_size must be at least 2
assert(mat_size >= 2);
user_defined = 1;
}
printf("Matrix size: %i x %i\n", mat_size, mat_size);
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a specific precision of the eigenvalue has to be obtained
//! @param argc number of command line arguments (from main(argc, argv))
//! @param argv pointers to command line arguments (from main(argc, argv))
//! @param precision desired precision of the eigenvalues, updated if a
//! specific precision is specified on the command line
//! @param user_defined 1 if the precision has been requested by the user,
//! 0 if the default precision is used
////////////////////////////////////////////////////////////////////////////////
void
getPrecision(int argc, char **argv, float &precision, unsigned int &user_defined)
{
float temp = -1.0f;
if (checkCmdLineFlag(argc, (const char **)argv, "precision"))
{
temp = getCmdLineArgumentFloat(argc, (const char **) argv, "precision");
printf("Precision is between [0.001, 0.000001]\n");
}
if (temp > 1e-6 && temp <= 0.001)
{
precision = temp;
user_defined = 1;
}
printf("Precision: %f\n", precision);
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a particular number of iterations for timings has to be used
//! @param argc number of command line arguments (from main(argc, argv))
//! @param argv pointers to command line arguments (from main(argc, argv))
//! @param iters_timing number of timing iterations, updated if the user
//! specifies a value on the command line
////////////////////////////////////////////////////////////////////////////////
void
getItersTiming(int argc, char **argv, unsigned int &iters_timing)
{
int temp = -1;
if (checkCmdLineFlag(argc, (const char **)argv, "iters-timing"))
{
temp = getCmdLineArgumentInt(argc, (const char **) argv, "iters-timing");
}
if (temp > 0)
{
iters_timing = temp;
}
printf("Iterations to be timed: %i\n", iters_timing);
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a particular filename has to be used for the file where the result
//! is stored
//! @param argc number of command line arguments (from main(argc, argv))
//! @param argv pointers to command line arguments (from main(argc, argv))
//! @param filename filename of the result file, updated if the user
//! specifies a filename on the command line
////////////////////////////////////////////////////////////////////////////////
void
getResultFilename(int argc, char **argv, char *&filename)
{
char *temp = NULL;
getCmdLineArgumentString(argc, (const char **) argv, "filename-result",
&temp);
if (NULL != temp)
{
filename = (char *) malloc(sizeof(char) * (strlen(temp) + 1));
strcpy(filename, temp);
free(temp);
}
printf("Result filename: '%s'\n", filename);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
bool
runTest(int argc, char **argv)
{
bool bCompareResult = false;
findCudaDevice(argc, (const char **)argv);
StopWatchInterface *timer = NULL;
StopWatchInterface *timer_total = NULL;
sdkCreateTimer(&timer);
sdkCreateTimer(&timer_total);
// default
unsigned int mat_size = 2048;
// flag if the matrix size is due to explicit user request
unsigned int user_defined = 0;
// desired precision of eigenvalues
float precision = 0.00001f;
unsigned int iters_timing = 100;
char *result_file = (char *)"eigenvalues.dat";
// check if there is a command line request for the matrix size
getMatrixSize(argc, argv, mat_size, user_defined);
// check if user requested specific precision
getPrecision(argc, argv, precision, user_defined);
// check if user requested specific number of iterations for timing
getItersTiming(argc, argv, iters_timing);
// file name for result file
getResultFilename(argc, argv, result_file);
// set up input
InputData input;
initInputData(input, argv[0], mat_size, user_defined);
// compute Gerschgorin interval
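// (Gerschgorin circle theorem for a symmetric tridiagonal matrix: every
// eigenvalue lies in an interval [a_i - r_i, a_i + r_i] with
// r_i = |b_{i-1}| + |b_i|, so taking lg / ug as the minimum lower and maximum
// upper bound over all i encloses the entire spectrum.)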
float lg = FLT_MAX;
float ug = -FLT_MAX;
computeGerschgorin(input.a, input.b+1, mat_size, lg, ug);
printf("Gerschgorin interval: %f / %f\n", lg, ug);
// two kernels, for small matrices a lot of overhead can be avoided
if (mat_size <= MAX_SMALL_MATRIX)
{
// initialize memory for result
ResultDataSmall result;
initResultSmallMatrix(result, mat_size);
// run the kernel
computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug,
precision, iters_timing);
// get the result from the device and do some sanity checks,
// save the result
processResultSmallMatrix(input, result, mat_size, result_file);
// clean up
cleanupResultSmallMatrix(result);
printf("User requests non-default argument(s), skipping self-check!\n");
bCompareResult = true;
}
else
{
// initialize memory for result
ResultDataLarge result;
initResultDataLargeMatrix(result, mat_size);
// run the kernel
computeEigenvaluesLargeMatrix(input, result, mat_size,
precision, lg, ug,
iters_timing);
// get the result from the device and do some sanity checks
// save the result if user specified matrix size
bCompareResult = processResultDataLargeMatrix(input, result, mat_size, result_file,
user_defined, argv[0]);
// cleanup
cleanupResultDataLargeMatrix(result);
}
cleanupInputData(input);
return bCompareResult;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Determine eigenvalues for small symmetric, tridiagonal matrix */
#ifndef _BISECT_KERNEL_SMALL_H_
#define _BISECT_KERNEL_SMALL_H_
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// includes, project
#include "config.h"
#include "util.h"
// additional kernel
#include "bisect_util.cu"
////////////////////////////////////////////////////////////////////////////////
//! Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
//! @param g_d diagonal elements in global memory
//! @param g_s superdiagonal elements in global memory (stored so that the
//! element *(g_s - 1) can be accessed and equals 0)
//! @param n size of matrix
//! @param lg lower bound of input interval (e.g. Gerschgorin interval)
//! @param ug upper bound of input interval (e.g. Gerschgorin interval)
//! @param lg_eig_count number of eigenvalues that are smaller than \a lg
//! @param ug_eig_count number of eigenvalues that are smaller than \a ug
//! @param epsilon desired accuracy of eigenvalues to compute
////////////////////////////////////////////////////////////////////////////////
__global__
void
bisectKernel(float *g_d, float *g_s, const unsigned int n,
float *g_left, float *g_right,
unsigned int *g_left_count, unsigned int *g_right_count,
const float lg, const float ug,
const unsigned int lg_eig_count, const unsigned int ug_eig_count,
float epsilon
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// intervals (store left and right because the subdivision tree is in general
// not dense)
__shared__ float s_left[MAX_THREADS_BLOCK_SMALL_MATRIX];
__shared__ float s_right[MAX_THREADS_BLOCK_SMALL_MATRIX];
// number of eigenvalues that are smaller than s_left / s_right
// (correspondence is realized via indices)
__shared__ unsigned int s_left_count[MAX_THREADS_BLOCK_SMALL_MATRIX];
__shared__ unsigned int s_right_count[MAX_THREADS_BLOCK_SMALL_MATRIX];
// helper for stream compaction
__shared__ unsigned int
s_compaction_list[MAX_THREADS_BLOCK_SMALL_MATRIX + 1];
// state variables for whole block
// if 0 then compaction of second chunk of child intervals is not necessary
// (because all intervals had exactly one non-dead child)
__shared__ unsigned int compact_second_chunk;
__shared__ unsigned int all_threads_converged;
// number of currently active threads
__shared__ unsigned int num_threads_active;
// number of threads to use for stream compaction
__shared__ unsigned int num_threads_compaction;
// helper for exclusive scan
unsigned int *s_compaction_list_exc = s_compaction_list + 1;
// variables for currently processed interval
// left and right limit of active interval
float left = 0.0f;
float right = 0.0f;
unsigned int left_count = 0;
unsigned int right_count = 0;
// midpoint of active interval
float mid = 0.0f;
// number of eigenvalues smaller than mid
unsigned int mid_count = 0;
// affected from compaction
unsigned int is_active_second = 0;
s_compaction_list[threadIdx.x] = 0;
s_left[threadIdx.x] = 0;
s_right[threadIdx.x] = 0;
s_left_count[threadIdx.x] = 0;
s_right_count[threadIdx.x] = 0;
cg::sync(cta);
// set up initial configuration
if (0 == threadIdx.x)
{
s_left[0] = lg;
s_right[0] = ug;
s_left_count[0] = lg_eig_count;
s_right_count[0] = ug_eig_count;
compact_second_chunk = 0;
num_threads_active = 1;
num_threads_compaction = 1;
}
// for all active threads read intervals from the last level
// the number of (worst case) active threads per level l is 2^l
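// Each pass of the loop below (1) bisects every active interval at its
// midpoint, (2) counts the eigenvalues below that midpoint to decide which
// child intervals are non-empty, and (3) stream-compacts the newly created
// second children so the active intervals stay densely packed in shared
// memory. The loop exits once every interval has collapsed to a single
// eigenvalue (left == right) within the requested precision.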
while (true)
{
all_threads_converged = 1;
cg::sync(cta);
is_active_second = 0;
subdivideActiveInterval(threadIdx.x,
s_left, s_right, s_left_count, s_right_count,
num_threads_active,
left, right, left_count, right_count,
mid, all_threads_converged);
cg::sync(cta);
// check if done
if (1 == all_threads_converged)
{
break;
}
cg::sync(cta);
// compute number of eigenvalues smaller than mid
// use all threads for reading the necessary matrix data from global
// memory
// use s_left and s_right as scratch space for diagonal and
// superdiagonal of matrix
mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,
threadIdx.x, num_threads_active,
s_left, s_right,
(left == right), cta);
cg::sync(cta);
// store intervals
// each thread stores its first child interval in a contiguous chunk of
// memory and its second child interval -- if it exists -- in a second
// chunk; it is likely that all threads reach convergence up to
// \a epsilon at the same level; furthermore, at higher levels most / all
// threads have only one child, so storing the first children compactly
// (first) avoids a compaction step on the first chunk and (second) makes
// a compaction of the second chunk unnecessary at higher levels (when all
// threads / intervals have exactly one child)
if (threadIdx.x < num_threads_active)
{
if (left != right)
{
// store intervals
storeNonEmptyIntervals(threadIdx.x, num_threads_active,
s_left, s_right, s_left_count, s_right_count,
left, mid, right,
left_count, mid_count, right_count,
epsilon, compact_second_chunk,
s_compaction_list_exc,
is_active_second);
}
else
{
storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
left, mid, right,
left_count, mid_count, right_count,
s_compaction_list_exc, compact_second_chunk,
num_threads_active,
is_active_second);
}
}
// necessary so that compact_second_chunk is up-to-date
cg::sync(cta);
// perform compaction of chunk where second children are stored
// scan of (num_threads_active / 2) elements, thus at most
// (num_threads_active / 4) threads are needed
if (compact_second_chunk > 0)
{
createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, cta);
compactIntervals(s_left, s_right, s_left_count, s_right_count,
mid, right, mid_count, right_count,
s_compaction_list, num_threads_active,
is_active_second);
}
cg::sync(cta);
if (0 == threadIdx.x)
{
// update number of active threads with result of reduction
num_threads_active += s_compaction_list[num_threads_active];
num_threads_compaction = ceilPow2(num_threads_active);
compact_second_chunk = 0;
}
cg::sync(cta);
}
cg::sync(cta);
// write resulting intervals to global mem
// every thread that has converged to an eigenvalue writes it to a
// separate array
// at most n valid intervals
if (threadIdx.x < n)
{
// intervals converged so left and right limit are identical
g_left[threadIdx.x] = s_left[threadIdx.x];
// left count is sufficient to have global order
g_left_count[threadIdx.x] = s_left_count[threadIdx.x];
}
}
#endif // #ifndef _BISECT_KERNEL_SMALL_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Computation of eigenvalues of a large symmetric, tridiagonal matrix */
#ifndef _BISECT_LARGE_CUH_
#define _BISECT_LARGE_CUH_
extern "C" {
////////////////////////////////////////////////////////////////////////////////
//! Run the kernels to compute the eigenvalues for large matrices
//! @param input handles to input data
//! @param result handles to result data
//! @param mat_size matrix size
//! @param precision desired precision of eigenvalues
//! @param lg lower limit of Gerschgorin interval
//! @param ug upper limit of Gerschgorin interval
//! @param iterations number of iterations (for timing)
////////////////////////////////////////////////////////////////////////////////
void
computeEigenvaluesLargeMatrix(const InputData &input, const ResultDataLarge &result,
const unsigned int mat_size, const float precision,
const float lg, const float ug,
const unsigned int iterations);
////////////////////////////////////////////////////////////////////////////////
//! Initialize variables and memory for result
//! @param result handles to memory
//! @param mat_size size of the matrix
////////////////////////////////////////////////////////////////////////////////
void
initResultDataLargeMatrix(ResultDataLarge &result, const unsigned int mat_size);
////////////////////////////////////////////////////////////////////////////////
//! Cleanup result memory
//! @param result handles to memory
////////////////////////////////////////////////////////////////////////////////
void
cleanupResultDataLargeMatrix(ResultDataLarge &result);
////////////////////////////////////////////////////////////////////////////////
//! Process the result, that is obtain result from device and do simple sanity
//! checking
//! @param input handles to input data
//! @param result handles to result data
//! @param mat_size matrix size
//! @param filename output filename
////////////////////////////////////////////////////////////////////////////////
bool
processResultDataLargeMatrix(const InputData &input, const ResultDataLarge &result,
const unsigned int mat_size,
const char *filename,
const unsigned int user_defined, char *exec_path);
}
#endif // #ifndef _BISECT_LARGE_CUH_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Utility / shared functionality for bisection kernels */
#ifndef _BISECT_UTIL_H_
#define _BISECT_UTIL_H_
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// includes, project
#include "config.h"
#include "util.h"
////////////////////////////////////////////////////////////////////////////////
//! Compute the next lower power of two of n
//! @param n number for which the next lower power of two is sought
////////////////////////////////////////////////////////////////////////////////
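// Example: floorPow2(6): frexp() decomposes 6 = 0.75 * 2^3, so exp = 3 and
// the result is 1 << (3 - 1) = 4; exact powers of two are returned unchanged
// by the early out.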
__device__
inline int
floorPow2(int n)
{
// early out if already power of two
if (0 == (n & (n-1)))
{
return n;
}
int exp;
frexp((float)n, &exp);
return (1 << (exp - 1));
}
////////////////////////////////////////////////////////////////////////////////
//! Compute the next higher power of two of n
//! @param n number for which next higher power of two is sought
////////////////////////////////////////////////////////////////////////////////
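// Example: ceilPow2(6): frexp() yields exp = 3, so the result is 1 << 3 = 8;
// ceilPow2(8) returns 8 via the early out.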
__device__
inline int
ceilPow2(int n)
{
// early out if already power of two
if (0 == (n & (n-1)))
{
return n;
}
int exp;
frexp((float)n, &exp);
return (1 << exp);
}
////////////////////////////////////////////////////////////////////////////////
//! Compute midpoint of interval [\a left, \a right] avoiding overflow if
//! possible
//! @param left left / lower limit of interval
//! @param right right / upper limit of interval
////////////////////////////////////////////////////////////////////////////////
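// The same-sign branch computes left + (right - left) * 0.5f instead of
// (left + right) * 0.5f because the direct sum can overflow: e.g. for
// left = 3.0e38f and right = 3.3e38f the sum exceeds FLT_MAX and becomes
// +inf, while the difference form stays finite.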
__device__
inline float
computeMidpoint(const float left, const float right)
{
float mid;
if (sign_f(left) == sign_f(right))
{
mid = left + (right - left) * 0.5f;
}
else
{
mid = (left + right) * 0.5f;
}
return mid;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if interval converged and store appropriately
//! @param addr address where to store the information of the interval
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//! than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//! than right interval limits
//! @param left lower limit of interval
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param right_count eigenvalues less than \a right
//! @param precision desired precision for eigenvalues
////////////////////////////////////////////////////////////////////////////////
template<class S, class T>
__device__
void
storeInterval(unsigned int addr,
float *s_left, float *s_right,
T *s_left_count, T *s_right_count,
float left, float right,
S left_count, S right_count,
float precision)
{
s_left_count[addr] = left_count;
s_right_count[addr] = right_count;
// check if interval converged
float t0 = abs(right - left);
float t1 = max(abs(left), abs(right)) * precision;
if (t0 <= max(MIN_ABS_INTERVAL, t1))
{
// compute mid point
float lambda = computeMidpoint(left, right);
// mark as converged
s_left[addr] = lambda;
s_right[addr] = lambda;
}
else
{
// store current limits
s_left[addr] = left;
s_right[addr] = right;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Compute number of eigenvalues that are smaller than x given a symmetric,
//! real, and tridiagonal matrix
//! @param g_d diagonal elements stored in global memory
//! @param g_s superdiagonal elements stored in global memory
//! @param n size of matrix
//! @param x value for which the number of eigenvalues that are smaller is
//! sought
//! @param tid thread identified (e.g. threadIdx.x or gtid)
//! @param num_intervals_active number of active intervals / threads that
//! currently process an interval
//! @param s_d scratch space to store diagonal entries of the tridiagonal
//! matrix in shared memory
//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
//! matrix in shared memory
//! @param converged flag if the current thread is already converged (that
//! is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////
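// The loop below implements the classic bisection / Sturm-sequence count:
// the recurrence delta_k = (d_k - x) - s_{k-1}^2 / delta_{k-1} carries out an
// implicit LDL^T factorization of (T - x*I), and by Sylvester's law of
// inertia the number of negative delta_k equals the number of eigenvalues of
// T that are smaller than x.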
__device__
inline unsigned int
computeNumSmallerEigenvals(float *g_d, float *g_s, const unsigned int n,
const float x,
const unsigned int tid,
const unsigned int num_intervals_active,
float *s_d, float *s_s,
unsigned int converged,
cg::thread_block cta
)
{
float delta = 1.0f;
unsigned int count = 0;
cg::sync(cta);
// read data into shared memory
if (threadIdx.x < n)
{
s_d[threadIdx.x] = *(g_d + threadIdx.x);
s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
}
cg::sync(cta);
// perform loop only for active threads
if ((tid < num_intervals_active) && (0 == converged))
{
// perform (optimized) Gaussian elimination to determine the number
// of eigenvalues that are smaller than x
for (unsigned int k = 0; k < n; ++k)
{
delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
count += (delta < 0) ? 1 : 0;
}
} // end if thread currently processing an interval
return count;
}
////////////////////////////////////////////////////////////////////////////////
//! Compute number of eigenvalues that are smaller than x given a symmetric,
//! real, and tridiagonal matrix
//! @param g_d diagonal elements stored in global memory
//! @param g_s superdiagonal elements stored in global memory
//! @param n size of matrix
//! @param x value for which the number of eigenvalues that are smaller is
//! sought
//! @param tid thread identified (e.g. threadIdx.x or gtid)
//! @param num_intervals_active number of active intervals / threads that
//! currently process an interval
//! @param s_d scratch space to store diagonal entries of the tridiagonal
//! matrix in shared memory
//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
//! matrix in shared memory
//! @param converged flag if the current thread is already converged (that
//! is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////
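// Same Sturm-sequence count as computeNumSmallerEigenvals(), except that the
// diagonal and superdiagonal are streamed through shared memory in chunks of
// blockDim.x elements, so matrices larger than the thread block can be
// processed.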
__device__
inline unsigned int
computeNumSmallerEigenvalsLarge(float *g_d, float *g_s, const unsigned int n,
const float x,
const unsigned int tid,
const unsigned int num_intervals_active,
float *s_d, float *s_s,
unsigned int converged,
cg::thread_block cta
)
{
float delta = 1.0f;
unsigned int count = 0;
unsigned int rem = n;
// do until whole diagonal and superdiagonal has been loaded and processed
for (unsigned int i = 0; i < n; i += blockDim.x)
{
cg::sync(cta);
// read new chunk of data into shared memory
if ((i + threadIdx.x) < n)
{
s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
}
cg::sync(cta);
if (tid < num_intervals_active)
{
// perform (optimized) Gaussian elimination to determine the number
// of eigenvalues that are smaller than x
for (unsigned int k = 0; k < min(rem,blockDim.x); ++k)
{
delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
// delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
count += (delta < 0) ? 1 : 0;
}
} // end if thread currently processing an interval
rem -= blockDim.x;
}
return count;
}
////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
//! @param addr base address for storing intervals
//! @param num_threads_active number of threads / intervals in current sweep
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//! than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//! than right interval limits
//! @param left lower limit of interval
//! @param mid midpoint of interval
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param mid_count eigenvalues less than \a mid
//! @param right_count eigenvalues less than \a right
//! @param precision desired precision for eigenvalues
//! @param compact_second_chunk shared mem flag if second chunk is used and
//! ergo requires compaction
//! @param s_compaction_list_exc helper array for stream compaction,
//! s_compaction_list_exc[tid] = 1 when the
//! thread generated two child intervals
//! @param is_active_second set to 1 if the thread has a second non-empty child interval
////////////////////////////////////////////////////////////////////////////////
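// Counting logic behind the split: the parent interval holds
// right_count - left_count eigenvalues; after bisection at mid the first
// child holds mid_count - left_count of them and the second child holds
// right_count - mid_count. A child containing no eigenvalue is "dead" and is
// simply discarded here.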
template<class S, class T>
__device__
void
storeNonEmptyIntervals(unsigned int addr,
const unsigned int num_threads_active,
float *s_left, float *s_right,
T *s_left_count, T *s_right_count,
float left, float mid, float right,
const S left_count,
const S mid_count,
const S right_count,
float precision,
unsigned int &compact_second_chunk,
T *s_compaction_list_exc,
unsigned int &is_active_second)
{
// check if both child intervals are valid
if ((left_count != mid_count) && (mid_count != right_count))
{
// store the left interval
storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
left, mid, left_count, mid_count, precision);
// mark that a second interval has been generated, only stored after
// stream compaction of second chunk
is_active_second = 1;
s_compaction_list_exc[threadIdx.x] = 1;
atomicExch(&compact_second_chunk, 1);
}
else
{
// only one non-empty child interval
// mark that no second child
is_active_second = 0;
s_compaction_list_exc[threadIdx.x] = 0;
// store the one valid child interval
if (left_count != mid_count)
{
storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
left, mid, left_count, mid_count, precision);
}
else
{
storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
mid, right, mid_count, right_count, precision);
}
}
}
////////////////////////////////////////////////////////////////////////////////
//! Create indices for compaction, that is process \a s_compaction_list_exc
//! which is 1 for intervals that generated a second child and 0 otherwise
//! and create for each of the non-zero elements the index where the new
//! interval belongs to in a compact representation of all generated second
//! children
//! @param s_compaction_list_exc list containing the flags which threads
//! generated two children
//! @param num_threads_compaction number of threads to employ for compaction
////////////////////////////////////////////////////////////////////////////////
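// Implementation note: this is a block-level parallel prefix sum over
// s_compaction_list_exc -- the first loop is an up-sweep that accumulates
// partial sums up the tree, the second loop sweeps back down to distribute
// them. The resulting offsets are later used by compactIntervals() to give
// every second child interval a unique slot in the compacted chunk.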
template<class T>
__device__
void
createIndicesCompaction(T *s_compaction_list_exc,
unsigned int num_threads_compaction, cg::thread_block cta)
{
unsigned int offset = 1;
const unsigned int tid = threadIdx.x;
// higher levels of scan tree
for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
{
cg::sync(cta);
if (tid < d)
{
unsigned int ai = offset*(2*tid+1)-1;
unsigned int bi = offset*(2*tid+2)-1;
s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
+ s_compaction_list_exc[ai];
}
offset <<= 1;
}
// traverse down tree: first down to level 2 across
for (int d = 2; d < num_threads_compaction; d <<= 1)
{
offset >>= 1;
cg::sync(cta);
if (tid < (d-1))
{
unsigned int ai = offset*(tid+1) - 1;
unsigned int bi = ai + (offset >> 1);
s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
+ s_compaction_list_exc[ai];
}
}
cg::sync(cta);
}
///////////////////////////////////////////////////////////////////////////////
//! Perform stream compaction for second child intervals
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//! than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//! than right interval limits
//! @param mid midpoint of current interval (left of new interval)
//! @param right upper limit of interval
//! @param mid_count eigenvalues less than \a mid
//! @param s_compaction_list list containing the indices where the data has
//! to be stored
//! @param num_threads_active number of active threads / intervals
//! @param is_active_second set to 1 if the thread has a second non-empty child interval
///////////////////////////////////////////////////////////////////////////////
template<class T>
__device__
void
compactIntervals(float *s_left, float *s_right,
T *s_left_count, T *s_right_count,
float mid, float right,
unsigned int mid_count, unsigned int right_count,
T *s_compaction_list,
unsigned int num_threads_active,
unsigned int is_active_second)
{
const unsigned int tid = threadIdx.x;
// perform compaction / copy data for all threads where the second
// child is not dead
if ((tid < num_threads_active) && (1 == is_active_second))
{
unsigned int addr_w = num_threads_active + s_compaction_list[tid];
s_left[addr_w] = mid;
s_right[addr_w] = right;
s_left_count[addr_w] = mid_count;
s_right_count[addr_w] = right_count;
}
}
///////////////////////////////////////////////////////////////////////////////
//! Store intervals that have already converged (w.r.t. the desired precision),
//! duplicating intervals that contain multiple eigenvalues
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//! than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//! than right interval limits
//! @param left lower limit of interval
//! @param mid midpoint of interval (updated if split is necessary)
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param mid_count eigenvalues less than \a mid
//! @param right_count eigenvalues less than \a right
//! @param s_compaction_list_exc helper array for stream compaction, updated
//! at tid if split is necessary
//! @param compact_second_chunk shared mem flag if second chunk is used and
//! ergo requires compaction
//! @param num_threads_active number of active threads / intervals
///////////////////////////////////////////////////////////////////////////////
template<class T, class S>
__device__
void
storeIntervalConverged(float *s_left, float *s_right,
T *s_left_count, T *s_right_count,
float &left, float &mid, float &right,
S &left_count, S &mid_count, S &right_count,
T *s_compaction_list_exc,
unsigned int &compact_second_chunk,
const unsigned int num_threads_active)
{
const unsigned int tid = threadIdx.x;
const unsigned int multiplicity = right_count - left_count;
// check multiplicity of eigenvalue
if (1 == multiplicity)
{
// just re-store intervals, simple eigenvalue
s_left[tid] = left;
s_right[tid] = right;
s_left_count[tid] = left_count;
s_right_count[tid] = right_count;
// mark that no second child / clear
s_right_count[tid + num_threads_active] = 0;
s_compaction_list_exc[tid] = 0;
}
else
{
// number of eigenvalues after the split less than mid
mid_count = left_count + (multiplicity >> 1);
// store left interval
s_left[tid] = left;
s_right[tid] = right;
s_left_count[tid] = left_count;
s_right_count[tid] = mid_count;
mid = left;
// mark that second child interval exists
s_right_count[tid + num_threads_active] = right_count;
s_compaction_list_exc[tid] = 1;
compact_second_chunk = 1;
}
}
template<class T, class S>
__device__
void
storeIntervalConverged(float *s_left, float *s_right,
T *s_left_count, T *s_right_count,
float &left, float &mid, float &right,
S &left_count, S &mid_count, S &right_count,
T *s_compaction_list_exc,
unsigned int &compact_second_chunk,
const unsigned int num_threads_active,
unsigned int &is_active_second)
{
const unsigned int tid = threadIdx.x;
const unsigned int multiplicity = right_count - left_count;
// check multiplicity of eigenvalue
if (1 == multiplicity)
{
// just re-store intervals, simple eigenvalue
s_left[tid] = left;
s_right[tid] = right;
s_left_count[tid] = left_count;
s_right_count[tid] = right_count;
// mark that no second child / clear
is_active_second = 0;
s_compaction_list_exc[tid] = 0;
}
else
{
// number of eigenvalues after the split less than mid
mid_count = left_count + (multiplicity >> 1);
// store left interval
s_left[tid] = left;
s_right[tid] = right;
s_left_count[tid] = left_count;
s_right_count[tid] = mid_count;
mid = left;
// mark that second child interval exists
is_active_second = 1;
s_compaction_list_exc[tid] = 1;
compact_second_chunk = 1;
}
}
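// Worked example of the split above: for a converged interval with
// left_count = 3 and right_count = 7 the multiplicity is 4, so
// mid_count = 3 + (4 >> 1) = 5. The first (left) child then accounts for
// eigenvalues 4..5 and the duplicated second child -- marked via
// is_active_second / s_right_count -- for eigenvalues 6..7; both children
// reuse the identical limits left == mid == right.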
///////////////////////////////////////////////////////////////////////////////
//! Subdivide interval if active and not already converged
//! @param tid id of thread
//! @param s_left shared memory storage for left interval limits
//! @param s_right shared memory storage for right interval limits
//! @param s_left_count shared memory storage for number of eigenvalues less
//! than left interval limits
//! @param s_right_count shared memory storage for number of eigenvalues less
//! than right interval limits
//! @param num_threads_active number of active threads in warp
//! @param left lower limit of interval
//! @param right upper limit of interval
//! @param left_count eigenvalues less than \a left
//! @param right_count eigenvalues less than \a right
//! @param all_threads_converged shared memory flag if all threads are
//! converged
///////////////////////////////////////////////////////////////////////////////
template<class T>
__device__
void
subdivideActiveInterval(const unsigned int tid,
float *s_left, float *s_right,
T *s_left_count, T *s_right_count,
const unsigned int num_threads_active,
float &left, float &right,
unsigned int &left_count, unsigned int &right_count,
float &mid, unsigned int &all_threads_converged)
{
// for all active threads
if (tid < num_threads_active)
{
left = s_left[tid];
right = s_right[tid];
left_count = s_left_count[tid];
right_count = s_right_count[tid];
// check if thread already converged
if (left != right)
{
mid = computeMidpoint(left, right);
atomicExch(&all_threads_converged, 0);
}
else if ((right_count - left_count) > 1)
{
// mark as not converged if multiple eigenvalues enclosed
// duplicate interval in storeIntervalsConverged()
atomicExch(&all_threads_converged, 0);
}
} // end for all active threads
}
#endif // #ifndef _BISECT_UTIL_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Determine eigenvalues for large symmetric, tridiagonal matrix. First
step of the computation. */
#ifndef _BISECT_KERNEL_LARGE_H_
#define _BISECT_KERNEL_LARGE_H_
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// includes, project
#include "config.h"
#include "util.h"
// additional kernel
#include "bisect_util.cu"
// declaration, forward
////////////////////////////////////////////////////////////////////////////////
//! Write data to global memory
////////////////////////////////////////////////////////////////////////////////
__device__
void writeToGmem(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
const unsigned int num_blocks_mult,
float *g_left_one, float *g_right_one,
unsigned int *g_pos_one,
float *g_left_mult, float *g_right_mult,
unsigned int *g_left_count_mult,
unsigned int *g_right_count_mult,
float *s_left, float *s_right,
unsigned short *s_left_count, unsigned short *s_right_count,
unsigned int *g_blocks_mult,
unsigned int *g_blocks_mult_sum,
unsigned short *s_compaction_list,
unsigned short *s_cl_helper,
unsigned int offset_mult_lambda
);
////////////////////////////////////////////////////////////////////////////////
//! Perform final stream compaction before writing out data
////////////////////////////////////////////////////////////////////////////////
__device__
void
compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
unsigned int &offset_mult_lambda,
float *s_left, float *s_right,
unsigned short *s_left_count, unsigned short *s_right_count,
unsigned short *s_cl_one, unsigned short *s_cl_mult,
unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
unsigned int is_one_lambda, unsigned int is_one_lambda_2,
float &left, float &right, float &left_2, float &right_2,
unsigned int &left_count, unsigned int &right_count,
unsigned int &left_count_2, unsigned int &right_count_2,
unsigned int c_block_iend, unsigned int c_sum_block,
unsigned int c_block_iend_2, unsigned int c_sum_block_2,
cg::thread_block cta
);
////////////////////////////////////////////////////////////////////////////////
//! Perform scan to compact list of block start addresses
////////////////////////////////////////////////////////////////////////////////
__device__
void
scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_compaction,
unsigned short *s_cl_blocking,
unsigned short *s_cl_helper,
cg::thread_block cta
);
////////////////////////////////////////////////////////////////////////////////
//! Perform scan to obtain number of eigenvalues before a specific block
////////////////////////////////////////////////////////////////////////////////
__device__
void
scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
const unsigned int num_threads_compaction,
unsigned short *s_cl_blocking,
unsigned short *s_cl_helper, cg::thread_block cta
);
////////////////////////////////////////////////////////////////////////////////
//! Perform initial scan for compaction of intervals containing one and
//! multiple eigenvalues; also do initial scan to build blocks
////////////////////////////////////////////////////////////////////////////////
__device__
void
scanInitial(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
const unsigned int num_threads_compaction,
unsigned short *s_cl_one, unsigned short *s_cl_mult,
unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
cg::thread_block cta
);
////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
//! @param addr address where to store
////////////////////////////////////////////////////////////////////////////////
__device__
void
storeNonEmptyIntervalsLarge(unsigned int addr,
const unsigned int num_threads_active,
float *s_left, float *s_right,
unsigned short *s_left_count,
unsigned short *s_right_count,
float left, float mid, float right,
const unsigned short left_count,
const unsigned short mid_count,
const unsigned short right_count,
float epsilon,
unsigned int &compact_second_chunk,
unsigned short *s_compaction_list,
unsigned int &is_active_second
);
////////////////////////////////////////////////////////////////////////////////
//! Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
//! @param g_d diagonal elements in global memory
//! @param g_s superdiagonal elements in global memory (stored so that the
//! element *(g_s - 1) can be accessed and equals 0)
//! @param n size of matrix
//! @param lg lower bound of input interval (e.g. Gerschgorin interval)
//! @param ug upper bound of input interval (e.g. Gerschgorin interval)
//! @param lg_eig_count number of eigenvalues that are smaller than \a lg
//! @param ug_eig_count number of eigenvalues that are smaller than \a ug
//! @param epsilon desired accuracy of eigenvalues to compute
////////////////////////////////////////////////////////////////////////////////
__global__
void
bisectKernelLarge(float *g_d, float *g_s, const unsigned int n,
const float lg, const float ug,
const unsigned int lg_eig_count,
const unsigned int ug_eig_count,
float epsilon,
unsigned int *g_num_one,
unsigned int *g_num_blocks_mult,
float *g_left_one, float *g_right_one,
unsigned int *g_pos_one,
float *g_left_mult, float *g_right_mult,
unsigned int *g_left_count_mult,
unsigned int *g_right_count_mult,
unsigned int *g_blocks_mult,
unsigned int *g_blocks_mult_sum
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
const unsigned int tid = threadIdx.x;
// intervals (store left and right because the subdivision tree is in general
    // not dense)
__shared__ float s_left[2 * MAX_THREADS_BLOCK + 1];
__shared__ float s_right[2 * MAX_THREADS_BLOCK + 1];
// number of eigenvalues that are smaller than s_left / s_right
// (correspondence is realized via indices)
__shared__ unsigned short s_left_count[2 * MAX_THREADS_BLOCK + 1];
__shared__ unsigned short s_right_count[2 * MAX_THREADS_BLOCK + 1];
// helper for stream compaction
__shared__ unsigned short s_compaction_list[2 * MAX_THREADS_BLOCK + 1];
// state variables for whole block
// if 0 then compaction of second chunk of child intervals is not necessary
// (because all intervals had exactly one non-dead child)
__shared__ unsigned int compact_second_chunk;
// if 1 then all threads are converged
__shared__ unsigned int all_threads_converged;
// number of currently active threads
__shared__ unsigned int num_threads_active;
// number of threads to use for stream compaction
__shared__ unsigned int num_threads_compaction;
// helper for exclusive scan
unsigned short *s_compaction_list_exc = s_compaction_list + 1;
// variables for currently processed interval
// left and right limit of active interval
float left = 0.0f;
float right = 0.0f;
unsigned int left_count = 0;
unsigned int right_count = 0;
// midpoint of active interval
float mid = 0.0f;
    // number of eigenvalues smaller than mid
unsigned int mid_count = 0;
// helper for stream compaction (tracking of threads generating second child)
unsigned int is_active_second = 0;
// initialize lists
s_compaction_list[tid] = 0;
s_left[tid] = 0;
s_right[tid] = 0;
s_left_count[tid] = 0;
s_right_count[tid] = 0;
cg::sync(cta);
// set up initial configuration
if (0 == tid)
{
s_left[0] = lg;
s_right[0] = ug;
s_left_count[0] = lg_eig_count;
s_right_count[0] = ug_eig_count;
compact_second_chunk = 0;
num_threads_active = 1;
num_threads_compaction = 1;
all_threads_converged = 1;
}
cg::sync(cta);
// for all active threads read intervals from the last level
// the number of (worst case) active threads per level l is 2^l
while (true)
{
subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
num_threads_active,
left, right, left_count, right_count,
mid, all_threads_converged);
cg::sync(cta);
// check if done
if (1 == all_threads_converged)
{
break;
}
// compute number of eigenvalues smaller than mid
// use all threads for reading the necessary matrix data from global
// memory
// use s_left and s_right as scratch space for diagonal and
// superdiagonal of matrix
mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
mid, threadIdx.x,
num_threads_active,
s_left, s_right,
(left == right), cta);
cg::sync(cta);
// store intervals
        // for all threads store the first child interval in a continuous chunk of
        // memory, and the second child interval -- if it exists -- in a second
        // chunk; it is likely that all threads reach convergence up to
        // \a epsilon at the same level; furthermore, at higher levels most / all
        // threads will have only one child, so storing the first child compactly
        // (first) avoids a compaction step on the first chunk and (second) makes
        // a compaction of the second chunk unnecessary at higher levels (when all
        // threads / intervals have exactly one child)
if (tid < num_threads_active)
{
if (left != right)
{
// store intervals
storeNonEmptyIntervalsLarge(tid, num_threads_active,
s_left, s_right,
s_left_count, s_right_count,
left, mid, right,
left_count, mid_count, right_count,
epsilon, compact_second_chunk,
s_compaction_list_exc,
is_active_second);
}
else
{
// re-write converged interval (has to be stored again because s_left
// and s_right are used as scratch space for
            // computeNumSmallerEigenvalsLarge())
s_left[tid] = left;
s_right[tid] = left;
s_left_count[tid] = left_count;
s_right_count[tid] = right_count;
is_active_second = 0;
}
}
// necessary so that compact_second_chunk is up-to-date
cg::sync(cta);
// perform compaction of chunk where second children are stored
// scan of (num_threads_active / 2) elements, thus at most
// (num_threads_active / 4) threads are needed
if (compact_second_chunk > 0)
{
// create indices for compaction
createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, cta);
compactIntervals(s_left, s_right, s_left_count, s_right_count,
mid, right, mid_count, right_count,
s_compaction_list, num_threads_active,
is_active_second);
}
cg::sync(cta);
// update state variables
if (0 == tid)
{
// update number of active threads with result of reduction
num_threads_active += s_compaction_list[num_threads_active];
num_threads_compaction = ceilPow2(num_threads_active);
compact_second_chunk = 0;
all_threads_converged = 1;
}
cg::sync(cta);
if (num_threads_compaction > blockDim.x)
{
break;
}
}
cg::sync(cta);
// generate two lists of intervals; one with intervals that contain one
// eigenvalue (or are converged), and one with intervals that need further
// subdivision
// perform two scans in parallel
unsigned int left_count_2;
unsigned int right_count_2;
unsigned int tid_2 = tid + blockDim.x;
// cache in per thread registers so that s_left_count and s_right_count
// can be used for scans
left_count = s_left_count[tid];
right_count = s_right_count[tid];
// some threads have to cache data for two intervals
if (tid_2 < num_threads_active)
{
left_count_2 = s_left_count[tid_2];
right_count_2 = s_right_count[tid_2];
}
// compaction list for intervals containing one and multiple eigenvalues
// do not affect first element for exclusive scan
unsigned short *s_cl_one = s_left_count + 1;
unsigned short *s_cl_mult = s_right_count + 1;
// compaction list for generating blocks of intervals containing multiple
// eigenvalues
unsigned short *s_cl_blocking = s_compaction_list_exc;
// helper compaction list for generating blocks of intervals
__shared__ unsigned short s_cl_helper[2 * MAX_THREADS_BLOCK + 1];
if (0 == tid)
{
// set to 0 for exclusive scan
s_left_count[0] = 0;
s_right_count[0] = 0;
}
cg::sync(cta);
// flag if interval contains one or multiple eigenvalues
unsigned int is_one_lambda = 0;
unsigned int is_one_lambda_2 = 0;
// number of eigenvalues in the interval
unsigned int multiplicity = right_count - left_count;
is_one_lambda = (1 == multiplicity);
s_cl_one[tid] = is_one_lambda;
s_cl_mult[tid] = (! is_one_lambda);
// (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
s_cl_helper[tid] = 0;
if (tid_2 < num_threads_active)
{
unsigned int multiplicity = right_count_2 - left_count_2;
is_one_lambda_2 = (1 == multiplicity);
s_cl_one[tid_2] = is_one_lambda_2;
s_cl_mult[tid_2] = (! is_one_lambda_2);
// (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
s_cl_helper[tid_2] = 0;
}
else if (tid_2 < (2 * MAX_THREADS_BLOCK + 1))
{
// clear
s_cl_blocking[tid_2] = 0;
s_cl_helper[tid_2] = 0;
}
scanInitial(tid, tid_2, num_threads_active, num_threads_compaction,
s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper, cta);
scanSumBlocks(tid, tid_2, num_threads_active,
num_threads_compaction, s_cl_blocking, s_cl_helper, cta);
// end down sweep of scan
cg::sync(cta);
unsigned int c_block_iend = 0;
unsigned int c_block_iend_2 = 0;
unsigned int c_sum_block = 0;
unsigned int c_sum_block_2 = 0;
// for each thread / interval that corresponds to root node of interval block
// store start address of block and total number of eigenvalues in all blocks
// before this block (particular thread is irrelevant, constraint is to
// have a subset of threads so that one and only one of them is in each
// interval)
if (1 == s_cl_helper[tid])
{
c_block_iend = s_cl_mult[tid] + 1;
c_sum_block = s_cl_blocking[tid];
}
if (1 == s_cl_helper[tid_2])
{
c_block_iend_2 = s_cl_mult[tid_2] + 1;
c_sum_block_2 = s_cl_blocking[tid_2];
}
scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
s_cl_blocking, s_cl_helper, cta);
// finished second scan for s_cl_blocking
cg::sync(cta);
// determine the global results
__shared__ unsigned int num_blocks_mult;
__shared__ unsigned int num_mult;
__shared__ unsigned int offset_mult_lambda;
if (0 == tid)
{
num_blocks_mult = s_cl_blocking[num_threads_active - 1];
offset_mult_lambda = s_cl_one[num_threads_active - 1];
num_mult = s_cl_mult[num_threads_active - 1];
*g_num_one = offset_mult_lambda;
*g_num_blocks_mult = num_blocks_mult;
}
cg::sync(cta);
float left_2, right_2;
--s_cl_one;
--s_cl_mult;
--s_cl_blocking;
compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
s_left, s_right, s_left_count, s_right_count,
s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
is_one_lambda, is_one_lambda_2,
left, right, left_2, right_2,
left_count, right_count, left_count_2, right_count_2,
c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2, cta
);
cg::sync(cta);
// final adjustment before writing out data to global memory
if (0 == tid)
{
s_cl_blocking[num_blocks_mult] = num_mult;
s_cl_helper[0] = 0;
}
cg::sync(cta);
// write to global memory
writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
g_left_one, g_right_one, g_pos_one,
g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
s_left, s_right, s_left_count, s_right_count,
g_blocks_mult, g_blocks_mult_sum,
s_compaction_list, s_cl_helper, offset_mult_lambda);
}
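////////////////////////////////////////////////////////////////////////////////
//! Host-side reference: classical Sturm count for a symmetric tridiagonal
//! matrix. This is only an illustration of the quantity the kernel obtains via
//! computeNumSmallerEigenvalsLarge() (defined in bisect_util.cu); it is not
//! part of the sample and is never called. The parameter convention is an
//! assumption of this sketch: diag[0..n-1] holds the diagonal, subdiag[i]
//! couples rows i-1 and i, and subdiag[0] is ignored. The number of negative
//! pivots of the LDL^T factorization of (T - x*I) equals the number of
//! eigenvalues smaller than x.
////////////////////////////////////////////////////////////////////////////////
static unsigned int
sturmCountHost(const float *diag, const float *subdiag,
               const unsigned int n, const float x)
{
    unsigned int count = 0;
    float q = 1.0f;
    for (unsigned int i = 0; i < n; ++i)
    {
        // recurrence: q_i = (diag_i - x) - subdiag_i^2 / q_{i-1}
        float coupling = (0 == i) ? 0.0f : (subdiag[i] * subdiag[i]) / q;
        q = (diag[i] - x) - coupling;
        // crude pivot guard: treat a vanishing pivot as negative
        if ((q < 1.0e-30f) && (q > -1.0e-30f))
        {
            q = -1.0e-30f;
        }
        if (q < 0.0f)
        {
            ++count;
        }
    }
    return count;
}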
////////////////////////////////////////////////////////////////////////////////
//! Write data to global memory
////////////////////////////////////////////////////////////////////////////////
__device__
void writeToGmem(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
const unsigned int num_blocks_mult,
float *g_left_one, float *g_right_one,
unsigned int *g_pos_one,
float *g_left_mult, float *g_right_mult,
unsigned int *g_left_count_mult,
unsigned int *g_right_count_mult,
float *s_left, float *s_right,
unsigned short *s_left_count, unsigned short *s_right_count,
unsigned int *g_blocks_mult,
unsigned int *g_blocks_mult_sum,
unsigned short *s_compaction_list,
unsigned short *s_cl_helper,
unsigned int offset_mult_lambda
)
{
if (tid < offset_mult_lambda)
{
g_left_one[tid] = s_left[tid];
g_right_one[tid] = s_right[tid];
// right count can be used to order eigenvalues without sorting
g_pos_one[tid] = s_right_count[tid];
}
else
{
g_left_mult[tid - offset_mult_lambda] = s_left[tid];
g_right_mult[tid - offset_mult_lambda] = s_right[tid];
g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
}
if (tid_2 < num_threads_active)
{
if (tid_2 < offset_mult_lambda)
{
g_left_one[tid_2] = s_left[tid_2];
g_right_one[tid_2] = s_right[tid_2];
// right count can be used to order eigenvalues without sorting
g_pos_one[tid_2] = s_right_count[tid_2];
}
else
{
g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
}
} // end writing out data
    // note that s_cl_blocking = s_compaction_list + 1; that is, by writing out
    // s_compaction_list we write out the exclusive scan result
if (tid <= num_blocks_mult)
{
g_blocks_mult[tid] = s_compaction_list[tid];
g_blocks_mult_sum[tid] = s_cl_helper[tid];
}
if (tid_2 <= num_blocks_mult)
{
g_blocks_mult[tid_2] = s_compaction_list[tid_2];
g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
}
}
////////////////////////////////////////////////////////////////////////////////
//! Perform final stream compaction before writing data to global memory
////////////////////////////////////////////////////////////////////////////////
__device__
void
compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
unsigned int &offset_mult_lambda,
float *s_left, float *s_right,
unsigned short *s_left_count, unsigned short *s_right_count,
unsigned short *s_cl_one, unsigned short *s_cl_mult,
unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
unsigned int is_one_lambda, unsigned int is_one_lambda_2,
float &left, float &right, float &left_2, float &right_2,
unsigned int &left_count, unsigned int &right_count,
unsigned int &left_count_2, unsigned int &right_count_2,
unsigned int c_block_iend, unsigned int c_sum_block,
unsigned int c_block_iend_2, unsigned int c_sum_block_2,
cg::thread_block cta
)
{
// cache data before performing compaction
left = s_left[tid];
right = s_right[tid];
if (tid_2 < num_threads_active)
{
left_2 = s_left[tid_2];
right_2 = s_right[tid_2];
}
cg::sync(cta);
// determine addresses for intervals containing multiple eigenvalues and
// addresses for blocks of intervals
unsigned int ptr_w = 0;
unsigned int ptr_w_2 = 0;
unsigned int ptr_blocking_w = 0;
unsigned int ptr_blocking_w_2 = 0;
ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
: s_cl_mult[tid] + offset_mult_lambda;
if (0 != c_block_iend)
{
ptr_blocking_w = s_cl_blocking[tid];
}
if (tid_2 < num_threads_active)
{
ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
: s_cl_mult[tid_2] + offset_mult_lambda;
if (0 != c_block_iend_2)
{
ptr_blocking_w_2 = s_cl_blocking[tid_2];
}
}
cg::sync(cta);
// store compactly in shared mem
s_left[ptr_w] = left;
s_right[ptr_w] = right;
s_left_count[ptr_w] = left_count;
s_right_count[ptr_w] = right_count;
if (0 != c_block_iend)
{
s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
}
if (tid_2 < num_threads_active)
{
// store compactly in shared mem
s_left[ptr_w_2] = left_2;
s_right[ptr_w_2] = right_2;
s_left_count[ptr_w_2] = left_count_2;
s_right_count[ptr_w_2] = right_count_2;
if (0 != c_block_iend_2)
{
s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
}
}
}
////////////////////////////////////////////////////////////////////////////////
//! Compute addresses to obtain compact list of block start addresses
////////////////////////////////////////////////////////////////////////////////
__device__
void
scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_compaction,
unsigned short *s_cl_blocking,
unsigned short *s_cl_helper, cg::thread_block cta
)
{
    // prepare for second step of block generation: compaction of the block
    // list itself so that it can be written out efficiently
s_cl_blocking[tid] = s_cl_helper[tid];
if (tid_2 < num_threads_compaction)
{
s_cl_blocking[tid_2] = s_cl_helper[tid_2];
}
cg::sync(cta);
    // additional scan to compact s_cl_blocking, which permits generating a
    // compact list of eigenvalue blocks, each containing about
    // MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be
    // processed by one thread block in a subsequent processing step)
unsigned int offset = 1;
// build scan tree
for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
{
cg::sync(cta);
if (tid < d)
{
unsigned int ai = offset*(2*tid+1)-1;
unsigned int bi = offset*(2*tid+2)-1;
s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
}
offset <<= 1;
}
// traverse down tree: first down to level 2 across
for (int d = 2; d < num_threads_compaction; d <<= 1)
{
offset >>= 1;
cg::sync(cta);
//
if (tid < (d-1))
{
unsigned int ai = offset*(tid+1) - 1;
unsigned int bi = ai + (offset >> 1);
s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
}
}
}
////////////////////////////////////////////////////////////////////////////////
//! Perform scan to obtain number of eigenvalues before a specific block
////////////////////////////////////////////////////////////////////////////////
__device__
void
scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
const unsigned int num_threads_compaction,
unsigned short *s_cl_blocking,
unsigned short *s_cl_helper, cg::thread_block cta)
{
unsigned int offset = 1;
// first step of scan to build the sum of elements within each block
// build up tree
for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
{
cg::sync(cta);
if (tid < d)
{
unsigned int ai = offset*(2*tid+1)-1;
unsigned int bi = offset*(2*tid+2)-1;
s_cl_blocking[bi] += s_cl_blocking[ai];
}
offset *= 2;
}
    // second step of scan to build the sum of elements within each block
    // traverse down tree
for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
{
offset >>= 1;
cg::sync(cta);
if (tid < (d-1))
{
unsigned int ai = offset*(tid+1) - 1;
unsigned int bi = ai + (offset >> 1);
s_cl_blocking[bi] += s_cl_blocking[ai];
}
}
cg::sync(cta);
if (0 == tid)
{
// move last element of scan to last element that is valid
// necessary because the number of threads employed for scan is a power
        // of two and not necessarily the number of active threads
s_cl_helper[num_threads_active - 1] =
s_cl_helper[num_threads_compaction - 1];
s_cl_blocking[num_threads_active - 1] =
s_cl_blocking[num_threads_compaction - 1];
}
}
////////////////////////////////////////////////////////////////////////////////
//! Perform initial scan for compaction of intervals containing one and
//! multiple eigenvalues; also do initial scan to build blocks
////////////////////////////////////////////////////////////////////////////////
__device__
void
scanInitial(const unsigned int tid, const unsigned int tid_2,
const unsigned int num_threads_active,
const unsigned int num_threads_compaction,
unsigned short *s_cl_one, unsigned short *s_cl_mult,
unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
cg::thread_block cta
)
{
// perform scan to compactly write out the intervals containing one and
// multiple eigenvalues
// also generate tree for blocking of intervals containing multiple
// eigenvalues
unsigned int offset = 1;
// build scan tree
for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
{
cg::sync(cta);
if (tid < d)
{
unsigned int ai = offset*(2*tid+1);
unsigned int bi = offset*(2*tid+2)-1;
s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];
// s_cl_helper is binary and zero for an internal node and 1 for a
// root node of a tree corresponding to a block
// s_cl_blocking contains the number of nodes in each sub-tree at each
// iteration, the data has to be kept to compute the total number of
// eigenvalues per block that, in turn, is needed to efficiently
// write out data in the second step
if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
{
                // check how many children are not terminated
if (s_cl_helper[ai - 1] == 1)
{
// mark as terminated
s_cl_helper[bi] = 1;
}
else if (s_cl_helper[bi] == 1)
{
// mark as terminated
s_cl_helper[ai - 1] = 1;
}
                else // both children are non-terminated
{
unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];
if (temp > MAX_THREADS_BLOCK)
{
// the two child trees have to form separate blocks, terminate trees
s_cl_helper[ai - 1] = 1;
s_cl_helper[bi] = 1;
}
else
{
// build up tree by joining subtrees
s_cl_blocking[bi] = temp;
s_cl_blocking[ai - 1] = 0;
}
}
} // end s_cl_helper update
}
offset <<= 1;
}
// traverse down tree, this only for stream compaction, not for block
// construction
for (int d = 2; d < num_threads_compaction; d <<= 1)
{
offset >>= 1;
cg::sync(cta);
//
if (tid < (d-1))
{
unsigned int ai = offset*(tid+1) - 1;
unsigned int bi = ai + (offset >> 1);
s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
}
}
}
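////////////////////////////////////////////////////////////////////////////////
//! Host-side reference (a sketch, not part of the sample and never called):
//! the compaction addresses consumed by compactStreamsFinal() amount to an
//! exclusive prefix sum of the per-interval flags; a serial scan like the one
//! below can be used to check the device results. The device versions above
//! additionally build the blocking tree, which this sketch does not model.
////////////////////////////////////////////////////////////////////////////////
static void
exclusiveScanHost(const unsigned short *flags, unsigned short *prefix,
                  const unsigned int n)
{
    unsigned short running = 0;
    for (unsigned int i = 0; i < n; ++i)
    {
        // number of set flags strictly before position i
        prefix[i] = running;
        running = (unsigned short)(running + flags[i]);
    }
}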
////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//! currently processed by the thread
////////////////////////////////////////////////////////////////////////////////
__device__
void
storeNonEmptyIntervalsLarge(unsigned int addr,
const unsigned int num_threads_active,
float *s_left, float *s_right,
unsigned short *s_left_count,
unsigned short *s_right_count,
float left, float mid, float right,
const unsigned short left_count,
const unsigned short mid_count,
const unsigned short right_count,
float epsilon,
unsigned int &compact_second_chunk,
unsigned short *s_compaction_list,
unsigned int &is_active_second)
{
// check if both child intervals are valid
if ((left_count != mid_count) && (mid_count != right_count))
{
storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
left, mid, left_count, mid_count, epsilon);
is_active_second = 1;
s_compaction_list[threadIdx.x] = 1;
atomicExch(&compact_second_chunk, 1);
}
else
{
// only one non-empty child interval
// mark that no second child
is_active_second = 0;
s_compaction_list[threadIdx.x] = 0;
// store the one valid child interval
if (left_count != mid_count)
{
storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
left, mid, left_count, mid_count, epsilon);
}
else
{
storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
mid, right, mid_count, right_count, epsilon);
}
}
}
#endif // #ifndef _BISECT_KERNEL_LARGE_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Perform second step of bisection algorithm for large matrices for
 * intervals that, after the first step, contained more than one eigenvalue
 */
#ifndef _BISECT_KERNEL_LARGE_MULTI_H_
#define _BISECT_KERNEL_LARGE_MULTI_H_
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// includes, project
#include "config.h"
#include "util.h"
// additional kernel
#include "bisect_util.cu"
////////////////////////////////////////////////////////////////////////////////
//! Perform second step of bisection algorithm for large matrices for
//! intervals that after the first step contained more than one eigenvalue
//! @param g_d diagonal elements of symmetric, tridiagonal matrix
//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix
//! @param n matrix size
//! @param blocks_mult start addresses of blocks of intervals that are
//! processed by one block of threads, each of the
//! intervals contains more than one eigenvalue
//! @param blocks_mult_sum number of eigenvalues / singleton intervals in all
//!                        blocks preceding each block (output offset)
//! @param g_left left limits of intervals
//! @param g_right right limits of intervals
//! @param g_left_count number of eigenvalues less than left limits
//! @param g_right_count number of eigenvalues less than right limits
//! @param g_lambda final eigenvalue
//! @param g_pos index of eigenvalue (in ascending order)
//! @param precision desired precision of eigenvalues
////////////////////////////////////////////////////////////////////////////////
__global__
void
bisectKernelLarge_MultIntervals(float *g_d, float *g_s, const unsigned int n,
unsigned int *blocks_mult,
unsigned int *blocks_mult_sum,
float *g_left, float *g_right,
unsigned int *g_left_count,
unsigned int *g_right_count,
float *g_lambda, unsigned int *g_pos,
float precision
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
const unsigned int tid = threadIdx.x;
// left and right limits of interval
__shared__ float s_left[2 * MAX_THREADS_BLOCK];
__shared__ float s_right[2 * MAX_THREADS_BLOCK];
// number of eigenvalues smaller than interval limits
__shared__ unsigned int s_left_count[2 * MAX_THREADS_BLOCK];
__shared__ unsigned int s_right_count[2 * MAX_THREADS_BLOCK];
// helper array for chunk compaction of second chunk
__shared__ unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1];
// compaction list helper for exclusive scan
unsigned int *s_compaction_list_exc = s_compaction_list + 1;
// flag if all threads are converged
__shared__ unsigned int all_threads_converged;
// number of active threads
__shared__ unsigned int num_threads_active;
// number of threads to employ for compaction
__shared__ unsigned int num_threads_compaction;
// flag if second chunk has to be compacted
__shared__ unsigned int compact_second_chunk;
// parameters of block of intervals processed by this block of threads
__shared__ unsigned int c_block_start;
__shared__ unsigned int c_block_end;
__shared__ unsigned int c_block_offset_output;
// midpoint of currently active interval of the thread
float mid = 0.0f;
// number of eigenvalues smaller than \a mid
unsigned int mid_count = 0;
// current interval parameter
float left;
float right;
unsigned int left_count;
unsigned int right_count;
// helper for compaction, keep track which threads have a second child
unsigned int is_active_second = 0;
// initialize common start conditions
if (0 == tid)
{
c_block_start = blocks_mult[blockIdx.x];
c_block_end = blocks_mult[blockIdx.x + 1];
c_block_offset_output = blocks_mult_sum[blockIdx.x];
num_threads_active = c_block_end - c_block_start;
s_compaction_list[0] = 0;
num_threads_compaction = ceilPow2(num_threads_active);
all_threads_converged = 1;
compact_second_chunk = 0;
}
cg::sync(cta);
// read data into shared memory
if (tid < num_threads_active)
{
s_left[tid] = g_left[c_block_start + tid];
s_right[tid] = g_right[c_block_start + tid];
s_left_count[tid] = g_left_count[c_block_start + tid];
s_right_count[tid] = g_right_count[c_block_start + tid];
}
cg::sync(cta);
// do until all threads converged
while (true)
{
//for (int iter=0; iter < 0; iter++) {
// subdivide interval if currently active and not already converged
subdivideActiveInterval(tid, s_left, s_right,
s_left_count, s_right_count,
num_threads_active,
left, right, left_count, right_count,
mid, all_threads_converged);
cg::sync(cta);
// stop if all eigenvalues have been found
if (1 == all_threads_converged)
{
break;
}
// compute number of eigenvalues smaller than mid for active and not
        // converged intervals; use all threads for loading data from gmem and
        // s_left and s_right as scratch space to store the data loaded from
        // gmem in shared memory
mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
mid, tid, num_threads_active,
s_left, s_right,
(left == right), cta);
cg::sync(cta);
if (tid < num_threads_active)
{
// store intervals
if (left != right)
{
storeNonEmptyIntervals(tid, num_threads_active,
s_left, s_right, s_left_count, s_right_count,
left, mid, right,
left_count, mid_count, right_count,
precision, compact_second_chunk,
s_compaction_list_exc,
is_active_second);
}
else
{
storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
left, mid, right,
left_count, mid_count, right_count,
s_compaction_list_exc, compact_second_chunk,
num_threads_active,
is_active_second);
}
}
cg::sync(cta);
// compact second chunk of intervals if any of the threads generated
// two child intervals
if (1 == compact_second_chunk)
{
createIndicesCompaction(s_compaction_list_exc, num_threads_compaction, cta);
compactIntervals(s_left, s_right, s_left_count, s_right_count,
mid, right, mid_count, right_count,
s_compaction_list, num_threads_active,
is_active_second);
}
cg::sync(cta);
// update state variables
if (0 == tid)
{
num_threads_active += s_compaction_list[num_threads_active];
num_threads_compaction = ceilPow2(num_threads_active);
compact_second_chunk = 0;
all_threads_converged = 1;
}
cg::sync(cta);
// clear
s_compaction_list_exc[threadIdx.x] = 0;
s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
cg::sync(cta);
} // end until all threads converged
// write data back to global memory
if (tid < num_threads_active)
{
unsigned int addr = c_block_offset_output + tid;
g_lambda[addr] = s_left[tid];
g_pos[addr] = s_right_count[tid];
}
}
#endif // #ifndef _BISECT_KERNEL_LARGE_MULTI_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Computation of eigenvalues of a large symmetric, tridiagonal matrix */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, project
#include "helper_functions.h"
#include "helper_cuda.h"
#include "config.h"
#include "structs.h"
#include "util.h"
#include "matlab.h"
#include "bisect_large.cuh"
// includes, kernels
#include "bisect_kernel_large.cuh"
#include "bisect_kernel_large_onei.cuh"
#include "bisect_kernel_large_multi.cuh"
////////////////////////////////////////////////////////////////////////////////
//! Initialize variables and memory for result
//! @param result handles to memory
//! @param matrix_size size of the matrix
////////////////////////////////////////////////////////////////////////////////
void
initResultDataLargeMatrix(ResultDataLarge &result, const unsigned int mat_size)
{
// helper variables to initialize memory
unsigned int zero = 0;
unsigned int mat_size_f = sizeof(float) * mat_size;
unsigned int mat_size_ui = sizeof(unsigned int) * mat_size;
float *tempf = (float *) malloc(mat_size_f);
unsigned int *tempui = (unsigned int *) malloc(mat_size_ui);
for (unsigned int i = 0; i < mat_size; ++i)
{
tempf[i] = 0.0f;
tempui[i] = 0;
}
// number of intervals containing only one eigenvalue after the first step
checkCudaErrors(cudaMalloc((void **) &result.g_num_one,
sizeof(unsigned int)));
checkCudaErrors(cudaMemcpy(result.g_num_one, &zero, sizeof(unsigned int),
cudaMemcpyHostToDevice));
// number of (thread) blocks of intervals with multiple eigenvalues after
// the first iteration
checkCudaErrors(cudaMalloc((void **) &result.g_num_blocks_mult,
sizeof(unsigned int)));
checkCudaErrors(cudaMemcpy(result.g_num_blocks_mult, &zero,
sizeof(unsigned int),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &result.g_left_one, mat_size_f));
checkCudaErrors(cudaMalloc((void **) &result.g_right_one, mat_size_f));
checkCudaErrors(cudaMalloc((void **) &result.g_pos_one, mat_size_ui));
checkCudaErrors(cudaMalloc((void **) &result.g_left_mult, mat_size_f));
checkCudaErrors(cudaMalloc((void **) &result.g_right_mult, mat_size_f));
checkCudaErrors(cudaMalloc((void **) &result.g_left_count_mult,
mat_size_ui));
checkCudaErrors(cudaMalloc((void **) &result.g_right_count_mult,
mat_size_ui));
checkCudaErrors(cudaMemcpy(result.g_left_one, tempf, mat_size_f,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_right_one, tempf, mat_size_f,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_pos_one, tempui, mat_size_ui,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_left_mult, tempf, mat_size_f,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_right_mult, tempf, mat_size_f,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_left_count_mult, tempui, mat_size_ui,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(result.g_right_count_mult, tempui, mat_size_ui,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &result.g_blocks_mult, mat_size_ui));
checkCudaErrors(cudaMemcpy(result.g_blocks_mult, tempui, mat_size_ui,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &result.g_blocks_mult_sum, mat_size_ui));
checkCudaErrors(cudaMemcpy(result.g_blocks_mult_sum, tempui, mat_size_ui,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &result.g_lambda_mult, mat_size_f));
checkCudaErrors(cudaMemcpy(result.g_lambda_mult, tempf, mat_size_f,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &result.g_pos_mult, mat_size_ui));
    checkCudaErrors(cudaMemcpy(result.g_pos_mult, tempui, mat_size_ui,
                               cudaMemcpyHostToDevice));
    free(tempf);
    free(tempui);
}
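// Note: the zero-initialization above could also be done without host-side
// temporary buffers by clearing the device allocations directly; a minimal
// sketch of that alternative (same byte counts as above):
//
//     checkCudaErrors(cudaMemset(result.g_left_one, 0, mat_size_f));
//     checkCudaErrors(cudaMemset(result.g_pos_one, 0, mat_size_ui));
//     // ... and likewise for the remaining result buffers
//
// cudaMemset() writes byte values, and all-zero bytes represent 0.0f and 0u,
// so the result is identical for these float / unsigned int arrays.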
////////////////////////////////////////////////////////////////////////////////
//! Cleanup result memory
//! @param result handles to memory
////////////////////////////////////////////////////////////////////////////////
void
cleanupResultDataLargeMatrix(ResultDataLarge &result)
{
checkCudaErrors(cudaFree(result.g_num_one));
checkCudaErrors(cudaFree(result.g_num_blocks_mult));
checkCudaErrors(cudaFree(result.g_left_one));
checkCudaErrors(cudaFree(result.g_right_one));
checkCudaErrors(cudaFree(result.g_pos_one));
checkCudaErrors(cudaFree(result.g_left_mult));
checkCudaErrors(cudaFree(result.g_right_mult));
checkCudaErrors(cudaFree(result.g_left_count_mult));
checkCudaErrors(cudaFree(result.g_right_count_mult));
checkCudaErrors(cudaFree(result.g_blocks_mult));
checkCudaErrors(cudaFree(result.g_blocks_mult_sum));
checkCudaErrors(cudaFree(result.g_lambda_mult));
checkCudaErrors(cudaFree(result.g_pos_mult));
}
////////////////////////////////////////////////////////////////////////////////
//! Run the kernels to compute the eigenvalues for large matrices
//! @param input handles to input data
//! @param result handles to result data
//! @param mat_size matrix size
//! @param precision desired precision of eigenvalues
//! @param lg lower limit of Gerschgorin interval
//! @param ug upper limit of Gerschgorin interval
//! @param iterations number of iterations (for timing)
////////////////////////////////////////////////////////////////////////////////
void
computeEigenvaluesLargeMatrix(const InputData &input, const ResultDataLarge &result,
const unsigned int mat_size, const float precision,
const float lg, const float ug,
const unsigned int iterations)
{
dim3 blocks(1, 1, 1);
dim3 threads(MAX_THREADS_BLOCK, 1, 1);
StopWatchInterface *timer_step1 = NULL;
StopWatchInterface *timer_step2_one = NULL;
StopWatchInterface *timer_step2_mult = NULL;
StopWatchInterface *timer_total = NULL;
sdkCreateTimer(&timer_step1);
sdkCreateTimer(&timer_step2_one);
sdkCreateTimer(&timer_step2_mult);
sdkCreateTimer(&timer_total);
sdkStartTimer(&timer_total);
// do for multiple iterations to improve timing accuracy
for (unsigned int iter = 0; iter < iterations; ++iter)
{
sdkStartTimer(&timer_step1);
bisectKernelLarge<<< blocks, threads >>>
(input.g_a, input.g_b, mat_size,
lg, ug, 0, mat_size, precision,
result.g_num_one, result.g_num_blocks_mult,
result.g_left_one, result.g_right_one, result.g_pos_one,
result.g_left_mult, result.g_right_mult,
result.g_left_count_mult, result.g_right_count_mult,
result.g_blocks_mult, result.g_blocks_mult_sum
);
getLastCudaError("Kernel launch failed.");
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer_step1);
// get the number of intervals containing one eigenvalue after the first
// processing step
unsigned int num_one_intervals;
checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one,
sizeof(unsigned int),
cudaMemcpyDeviceToHost));
dim3 grid_onei;
grid_onei.x = getNumBlocksLinear(num_one_intervals, MAX_THREADS_BLOCK);
dim3 threads_onei;
        // always use the maximum number of available threads to better balance
        // load times for matrix data
threads_onei.x = MAX_THREADS_BLOCK;
// compute eigenvalues for intervals that contained only one eigenvalue
// after the first processing step
sdkStartTimer(&timer_step2_one);
bisectKernelLarge_OneIntervals<<< grid_onei , threads_onei >>>
(input.g_a, input.g_b, mat_size, num_one_intervals,
result.g_left_one, result.g_right_one, result.g_pos_one,
precision
);
getLastCudaError("bisectKernelLarge_OneIntervals() FAILED.");
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer_step2_one);
// process intervals that contained more than one eigenvalue after
// the first processing step
        // get the number of blocks of intervals; the blocks are built so that,
        // once every interval contains only one eigenvalue, each block holds
        // not more than MAX_THREADS_BLOCK eigenvalues (and thus threads)
unsigned int num_blocks_mult = 0;
checkCudaErrors(cudaMemcpy(&num_blocks_mult, result.g_num_blocks_mult,
sizeof(unsigned int),
cudaMemcpyDeviceToHost));
// setup the execution environment
dim3 grid_mult(num_blocks_mult, 1, 1);
dim3 threads_mult(MAX_THREADS_BLOCK, 1, 1);
sdkStartTimer(&timer_step2_mult);
bisectKernelLarge_MultIntervals<<< grid_mult, threads_mult >>>
(input.g_a, input.g_b, mat_size,
result.g_blocks_mult, result.g_blocks_mult_sum,
result.g_left_mult, result.g_right_mult,
result.g_left_count_mult, result.g_right_count_mult,
result.g_lambda_mult, result.g_pos_mult,
precision
);
getLastCudaError("bisectKernelLarge_MultIntervals() FAILED.");
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer_step2_mult);
}
sdkStopTimer(&timer_total);
printf("Average time step 1: %f ms\n",
sdkGetTimerValue(&timer_step1) / (float) iterations);
printf("Average time step 2, one intervals: %f ms\n",
sdkGetTimerValue(&timer_step2_one) / (float) iterations);
printf("Average time step 2, mult intervals: %f ms\n",
sdkGetTimerValue(&timer_step2_mult) / (float) iterations);
printf("Average time TOTAL: %f ms\n",
sdkGetTimerValue(&timer_total) / (float) iterations);
sdkDeleteTimer(&timer_step1);
sdkDeleteTimer(&timer_step2_one);
sdkDeleteTimer(&timer_step2_mult);
sdkDeleteTimer(&timer_total);
}
////////////////////////////////////////////////////////////////////////////////
//! Process the result, that is, obtain the result from the device and do
//! simple sanity checking
//! @param input handles to input data
//! @param result handles to result data
//! @param mat_size matrix size
//! @param filename output filename
////////////////////////////////////////////////////////////////////////////////
bool
processResultDataLargeMatrix(const InputData &input, const ResultDataLarge &result,
const unsigned int mat_size,
const char *filename,
const unsigned int user_defined, char *exec_path)
{
bool bCompareResult = false;
const unsigned int mat_size_ui = sizeof(unsigned int) * mat_size;
const unsigned int mat_size_f = sizeof(float) * mat_size;
// copy data from intervals that contained more than one eigenvalue after
// the first processing step
float *lambda_mult = (float *) malloc(sizeof(float) * mat_size);
checkCudaErrors(cudaMemcpy(lambda_mult, result.g_lambda_mult,
sizeof(float) * mat_size,
cudaMemcpyDeviceToHost));
unsigned int *pos_mult =
(unsigned int *) malloc(sizeof(unsigned int) * mat_size);
checkCudaErrors(cudaMemcpy(pos_mult, result.g_pos_mult,
sizeof(unsigned int) * mat_size,
cudaMemcpyDeviceToHost));
unsigned int *blocks_mult_sum =
(unsigned int *) malloc(sizeof(unsigned int) * mat_size);
checkCudaErrors(cudaMemcpy(blocks_mult_sum, result.g_blocks_mult_sum,
sizeof(unsigned int) * mat_size,
cudaMemcpyDeviceToHost));
unsigned int num_one_intervals;
checkCudaErrors(cudaMemcpy(&num_one_intervals, result.g_num_one,
sizeof(unsigned int),
cudaMemcpyDeviceToHost));
unsigned int sum_blocks_mult = mat_size - num_one_intervals;
// copy data for intervals that contained one eigenvalue after the first
// processing step
float *left_one = (float *) malloc(mat_size_f);
float *right_one = (float *) malloc(mat_size_f);
unsigned int *pos_one = (unsigned int *) malloc(mat_size_ui);
checkCudaErrors(cudaMemcpy(left_one, result.g_left_one, mat_size_f,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(right_one, result.g_right_one, mat_size_f,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(pos_one, result.g_pos_one, mat_size_ui,
cudaMemcpyDeviceToHost));
// extract eigenvalues
float *eigenvals = (float *) malloc(mat_size_f);
// singleton intervals generated in the second step
for (unsigned int i = 0; i < sum_blocks_mult; ++i)
{
eigenvals[pos_mult[i] - 1] = lambda_mult[i];
}
// singleton intervals generated in the first step
unsigned int index = 0;
for (unsigned int i = 0; i < num_one_intervals; ++i, ++index)
{
eigenvals[pos_one[i] - 1] = left_one[i];
}
if (1 == user_defined)
{
// store result
writeTridiagSymMatlab(filename, input.a, input.b+1, eigenvals, mat_size);
// getLastCudaError( sdkWriteFilef( filename, eigenvals, mat_size, 0.0f));
printf("User requests non-default argument(s), skipping self-check!\n");
bCompareResult = true;
}
else
{
// compare with reference solution
float *reference = NULL;
unsigned int input_data_size = 0;
char *ref_path = sdkFindFilePath("reference.dat", exec_path);
assert(NULL != ref_path);
sdkReadFile(ref_path, &reference, &input_data_size, false);
assert(input_data_size == mat_size);
        // the Sturm count computation is not exact, which makes an additional
        // tolerance offset necessary
float tolerance = 1.0e-5f + 5.0e-6f;
if (sdkCompareL2fe(reference, eigenvals, mat_size, tolerance) == true)
{
bCompareResult = true;
}
else
{
bCompareResult = false;
}
free(ref_path);
free(reference);
}
freePtr(eigenvals);
freePtr(lambda_mult);
freePtr(pos_mult);
freePtr(blocks_mult_sum);
freePtr(left_one);
freePtr(right_one);
freePtr(pos_one);
return bCompareResult;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Determine eigenvalues for large matrices for intervals that, after the
 * first step, contained exactly one eigenvalue
 */
#ifndef _BISECT_KERNEL_LARGE_ONEI_H_
#define _BISECT_KERNEL_LARGE_ONEI_H_
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// includes, project
#include "config.h"
#include "util.h"
// additional kernel
#include "bisect_util.cu"
////////////////////////////////////////////////////////////////////////////////
//! Determine eigenvalues for large matrices for intervals that after
//! the first step contained one eigenvalue
//! @param g_d diagonal elements of symmetric, tridiagonal matrix
//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix
//! @param n matrix size
//! @param num_intervals total number of intervals containing one eigenvalue
//! after the first step
//! @param g_left left interval limits
//! @param g_right right interval limits
//! @param g_pos index of interval / number of intervals that are smaller than
//! right interval limit
//! @param precision desired precision of eigenvalues
////////////////////////////////////////////////////////////////////////////////
__global__
void
bisectKernelLarge_OneIntervals(float *g_d, float *g_s, const unsigned int n,
unsigned int num_intervals,
float *g_left, float *g_right,
unsigned int *g_pos,
float precision)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;
__shared__ float s_left_scratch[MAX_THREADS_BLOCK];
__shared__ float s_right_scratch[MAX_THREADS_BLOCK];
// active interval of thread
// left and right limit of current interval
float left, right;
    // number of eigenvalues smaller than the right limit (also corresponds to
    // the global index of the eigenvalue contained in the active interval)
unsigned int right_count;
// flag if current thread converged
unsigned int converged = 0;
// midpoint when current interval is subdivided
float mid = 0.0f;
// number of eigenvalues less than mid
unsigned int mid_count = 0;
// read data from global memory
if (gtid < num_intervals)
{
left = g_left[gtid];
right = g_right[gtid];
right_count = g_pos[gtid];
}
// flag to determine if all threads converged to eigenvalue
__shared__ unsigned int converged_all_threads;
    // initialize shared flag
if (0 == threadIdx.x)
{
converged_all_threads = 0;
}
cg::sync(cta);
// process until all threads converged to an eigenvalue
// while( 0 == converged_all_threads) {
while (true)
{
atomicExch(&converged_all_threads, 1);
// update midpoint for all active threads
if ((gtid < num_intervals) && (0 == converged))
{
mid = computeMidpoint(left, right);
}
// find number of eigenvalues that are smaller than midpoint
mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
mid, gtid, num_intervals,
s_left_scratch,
s_right_scratch,
converged, cta);
cg::sync(cta);
// for all active threads
if ((gtid < num_intervals) && (0 == converged))
{
            // update intervals -- exactly one child interval survives
if (right_count == mid_count)
{
right = mid;
}
else
{
left = mid;
}
// check for convergence
float t0 = right - left;
float t1 = max(abs(right), abs(left)) * precision;
if (t0 < min(precision, t1))
{
float lambda = computeMidpoint(left, right);
left = lambda;
right = lambda;
converged = 1;
}
else
{
atomicExch(&converged_all_threads, 0);
}
}
cg::sync(cta);
if (1 == converged_all_threads)
{
break;
}
cg::sync(cta);
}
// write data back to global memory
cg::sync(cta);
if (gtid < num_intervals)
{
        // the interval converged, so the left and right interval limits are
        // identical and equal to the eigenvalue
g_left[gtid] = left;
}
}
#endif // #ifndef _BISECT_KERNEL_LARGE_ONEI_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample calculates scalar products of a
* given set of input vector pairs
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <helper_functions.h>
#include <helper_cuda.h>
///////////////////////////////////////////////////////////////////////////////
// Calculate scalar products of VectorN vectors of ElementN elements on CPU
///////////////////////////////////////////////////////////////////////////////
extern "C"
void scalarProdCPU(
float *h_C,
float *h_A,
float *h_B,
int vectorN,
int elementN
);
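///////////////////////////////////////////////////////////////////////////////
// The reference implementation of scalarProdCPU() lives in scalarProd_cpu.cpp
// (not included here). As a sketch of what it computes -- h_C[vec] is the dot
// product of the vec-th pair of elementN-long vectors -- a minimal host
// version could look like the following; the name scalarProdCPU_sketch and
// the double-precision accumulator are choices of this sketch, not of the
// sample.
///////////////////////////////////////////////////////////////////////////////
static void scalarProdCPU_sketch(
    float *h_C,
    const float *h_A,
    const float *h_B,
    int vectorN,
    int elementN
)
{
    for (int vec = 0; vec < vectorN; vec++)
    {
        int vectorBase = elementN * vec;
        double sum = 0;
        for (int pos = 0; pos < elementN; pos++)
        {
            // accumulate the product of corresponding elements
            sum += (double)h_A[vectorBase + pos] * (double)h_B[vectorBase + pos];
        }
        h_C[vec] = (float)sum;
    }
}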
///////////////////////////////////////////////////////////////////////////////
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
///////////////////////////////////////////////////////////////////////////////
#include "scalarProd_kernel.cuh"
////////////////////////////////////////////////////////////////////////////////
// Helper function, returning uniformly distributed
// random float in [low, high] range
////////////////////////////////////////////////////////////////////////////////
float RandFloat(float low, float high)
{
float t = (float)rand() / (float)RAND_MAX;
return (1.0f - t) * low + t * high;
}
///////////////////////////////////////////////////////////////////////////////
// Data configuration
///////////////////////////////////////////////////////////////////////////////
//Total number of input vector pairs; arbitrary
const int VECTOR_N = 256;
//Number of elements per vector; arbitrary,
//but strongly preferred to be a multiple of warp size
//to meet memory coalescing constraints
const int ELEMENT_N = 4096;
//Total number of data elements
const int DATA_N = VECTOR_N * ELEMENT_N;
const int DATA_SZ = DATA_N * sizeof(float);
const int RESULT_SZ = VECTOR_N * sizeof(float);
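//With the defaults above this amounts to 256 * 4096 = 1,048,576 elements per
//input array, i.e. DATA_SZ = 4 MB for each of A and B and RESULT_SZ = 1 KB
//for the results.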
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
float *h_A, *h_B, *h_C_CPU, *h_C_GPU;
float *d_A, *d_B, *d_C;
double delta, ref, sum_delta, sum_ref, L1norm;
StopWatchInterface *hTimer = NULL;
int i;
printf("%s Starting...\n\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
findCudaDevice(argc, (const char **)argv);
sdkCreateTimer(&hTimer);
printf("Initializing data...\n");
printf("...allocating CPU memory.\n");
h_A = (float *)malloc(DATA_SZ);
h_B = (float *)malloc(DATA_SZ);
h_C_CPU = (float *)malloc(RESULT_SZ);
h_C_GPU = (float *)malloc(RESULT_SZ);
printf("...allocating GPU memory.\n");
checkCudaErrors(cudaMalloc((void **)&d_A, DATA_SZ));
checkCudaErrors(cudaMalloc((void **)&d_B, DATA_SZ));
checkCudaErrors(cudaMalloc((void **)&d_C, RESULT_SZ));
printf("...generating input data in CPU mem.\n");
srand(123);
//Generating input data on CPU
for (i = 0; i < DATA_N; i++)
{
h_A[i] = RandFloat(0.0f, 1.0f);
h_B[i] = RandFloat(0.0f, 1.0f);
}
printf("...copying input data to GPU mem.\n");
//Copy options data to GPU memory for further processing
checkCudaErrors(cudaMemcpy(d_A, h_A, DATA_SZ, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_B, h_B, DATA_SZ, cudaMemcpyHostToDevice));
printf("Data init done.\n");
printf("Executing GPU kernel...\n");
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
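//Launch 128 blocks of 256 threads; with VECTOR_N == 256 each block
//handles two vectors through the grid-stride loop inside scalarProdGPU()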
scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, VECTOR_N, ELEMENT_N);
getLastCudaError("scalarProdGPU() execution failed\n");
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
printf("GPU time: %f msecs.\n", sdkGetTimerValue(&hTimer));
printf("Reading back GPU result...\n");
//Read back GPU results to compare them to CPU results
checkCudaErrors(cudaMemcpy(h_C_GPU, d_C, RESULT_SZ, cudaMemcpyDeviceToHost));
printf("Checking GPU results...\n");
printf("..running CPU scalar product calculation\n");
scalarProdCPU(h_C_CPU, h_A, h_B, VECTOR_N, ELEMENT_N);
printf("...comparing the results\n");
//Calculate max absolute difference and L1 distance
//between CPU and GPU results
sum_delta = 0;
sum_ref = 0;
for (i = 0; i < VECTOR_N; i++)
{
delta = fabs(h_C_GPU[i] - h_C_CPU[i]);
ref = h_C_CPU[i];
sum_delta += delta;
sum_ref += ref;
}
L1norm = sum_delta / sum_ref;
printf("Shutting down...\n");
checkCudaErrors(cudaFree(d_C));
checkCudaErrors(cudaFree(d_B));
checkCudaErrors(cudaFree(d_A));
free(h_C_GPU);
free(h_C_CPU);
free(h_B);
free(h_A);
sdkDeleteTimer(&hTimer);
printf("L1 error: %E\n", L1norm);
printf((L1norm < 1e-6) ? "Test passed\n" : "Test failed!\n");
exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
///////////////////////////////////////////////////////////////////////////////
// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
// (the same as for floating point multiplication and addition),
// whereas full 32-bit multiplication takes 16 clocks per warp.
// So if integer multiplication operands are guaranteed to fit into 24 bits
// (always lie within [-8M, 8M - 1] range in signed case),
// explicit 24-bit multiplication is preferred for performance.
///////////////////////////////////////////////////////////////////////////////
#define IMUL(a, b) __mul24(a, b)
///////////////////////////////////////////////////////////////////////////////
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
// Parameters restrictions:
// 1) ElementN is strongly preferred to be a multiple of warp size to
// meet alignment constraints of memory coalescing.
// 2) ACCUM_N must be a power of two.
///////////////////////////////////////////////////////////////////////////////
#define ACCUM_N 1024
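// Example: with the host default of elementN == 4096, each of the 1024
// accumulators below sums 4 strided products before the tree reduction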
__global__ void scalarProdGPU(
float *d_C,
float *d_A,
float *d_B,
int vectorN,
int elementN
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
//Accumulators cache
__shared__ float accumResult[ACCUM_N];
////////////////////////////////////////////////////////////////////////////
// Cycle through every pair of vectors,
// taking into account that vector counts can be different
// from total number of thread blocks
////////////////////////////////////////////////////////////////////////////
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
{
int vectorBase = IMUL(elementN, vec);
int vectorEnd = vectorBase + elementN;
////////////////////////////////////////////////////////////////////////
// Each accumulator cycles through vectors with
// stride equal to the total number of accumulators ACCUM_N.
// At this stage ACCUM_N is only preferred to be a multiple of warp size
// to meet memory coalescing alignment constraints.
////////////////////////////////////////////////////////////////////////
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
{
float sum = 0;
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
sum += d_A[pos] * d_B[pos];
accumResult[iAccum] = sum;
}
////////////////////////////////////////////////////////////////////////
// Perform tree-like reduction of accumulators' results.
// ACCUM_N has to be a power of two at this stage
////////////////////////////////////////////////////////////////////////
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
accumResult[iAccum] += accumResult[stride + iAccum];
}
cg::sync(cta);
if (threadIdx.x == 0) d_C[vec] = accumResult[0];
}
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// This sample is an implementation of a simple line-of-sight algorithm:
// Given a height map and a ray originating at some observation point,
// it computes all the points along the ray that are visible from the
// observation point.
// It is based on the description made in "Guy E. Blelloch. Vector models
// for data-parallel computing. MIT Press, 1990" and uses open source CUDA
// Thrust Library
#ifdef _WIN32
# define NOMINMAX
#endif
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, project
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_math.h>
// includes, library
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
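////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch (not part of the original sample): the visibility rule
// described at the top of this file reduces to comparing each view angle
// against the running maximum of all angles closer to the observer, which is
// exactly what the inclusive max-scan computes on the device. The plain CPU
// loop below mirrors that logic; the function name is hypothetical and the
// helper is not used elsewhere in the sample.
////////////////////////////////////////////////////////////////////////////////
static void lineOfSightSketch(const float *angles, unsigned char *visible, int n)
{
    float runningMax = -FLT_MAX; // no angle seen yet

    for (int i = 0; i < n; ++i)
    {
        // Inclusive max-scan value at position i
        runningMax = (angles[i] > runningMax) ? angles[i] : runningMax;

        // A point is visible iff its own angle equals the scanned maximum,
        // i.e. nothing closer to the observer rises higher
        visible[i] = (runningMax <= angles[i]) ? 1 : 0;
    }
}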
////////////////////////////////////////////////////////////////////////////////
// declaration, types
// Boolean
typedef unsigned char Bool;
enum
{
False = 0,
True = 1
};
// 2D height field
struct HeightField
{
int width;
float *height;
};
// Ray
struct Ray
{
float3 origin;
float2 dir;
int length;
float oneOverLength;
};
////////////////////////////////////////////////////////////////////////////////
// declaration, variables
// Height field texture reference
texture<float, 2, cudaReadModeElementType> g_HeightFieldTex;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(int argc, char **argv);
__global__ void computeAngles_kernel(const Ray, float *);
__global__ void computeVisibilities_kernel(const float *, const float *, int, Bool *);
void lineOfSight_gold(const HeightField, const Ray, Bool *);
__device__ __host__ float2 getLocation(const Ray, int);
__device__ __host__ float getAngle(const Ray, float2, float);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
int res = runTest(argc, argv);
if (res != 1)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a line-of-sight test for CUDA
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, char **argv)
{
////////////////////////////////////////////////////////////////////////////
// Device initialization
printf("[%s] - Starting...\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
findCudaDevice(argc, (const char **)argv);
////////////////////////////////////////////////////////////////////////////
// Timer
// Create
StopWatchInterface *timer;
sdkCreateTimer(&timer);
// Number of iterations to get accurate timing
uint numIterations = 100;
////////////////////////////////////////////////////////////////////////////
// Height field
HeightField heightField;
// Allocate in host memory
int2 dim = make_int2(10000, 100);
heightField.width = dim.x;
thrust::host_vector<float> height(dim.x * dim.y);
heightField.height = (float *)&height[0];
//
// Fill in with an arbitrary sine surface
for (int x = 0; x < dim.x; ++x)
for (int y = 0; y < dim.y; ++y)
{
float amp = 0.1f * (x + y);
float period = 2.0f + amp;
*(heightField.height + dim.x * y + x) =
amp * (sinf(sqrtf((float)(x * x + y * y)) * 2.0f * 3.1416f / period) + 1.0f);
}
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *heightFieldArray;
checkCudaErrors(cudaMallocArray(&heightFieldArray, &channelDesc, dim.x, dim.y));
// Initialize device memory
checkCudaErrors(cudaMemcpyToArray(heightFieldArray, 0, 0, heightField.height,
dim.x * dim.y * sizeof(float), cudaMemcpyHostToDevice));
// Set texture parameters
g_HeightFieldTex.addressMode[0] = cudaAddressModeClamp;
g_HeightFieldTex.addressMode[1] = cudaAddressModeClamp;
g_HeightFieldTex.filterMode = cudaFilterModePoint;
g_HeightFieldTex.normalized = 0;
// Bind CUDA array to texture reference
checkCudaErrors(cudaBindTextureToArray(g_HeightFieldTex, heightFieldArray,
channelDesc));
////////////////////////////////////////////////////////////////////////////
// Ray (starts at origin and traverses the height field diagonally)
Ray ray;
ray.origin = make_float3(0, 0, 2.0f);
int2 dir = make_int2(dim.x - 1, dim.y - 1);
ray.dir = make_float2((float)dir.x, (float)dir.y);
ray.length = max(abs(dir.x), abs(dir.y));
ray.oneOverLength = 1.0f / ray.length;
////////////////////////////////////////////////////////////////////////////
// View angles
// Allocate view angles for each point along the ray
thrust::device_vector<float> d_angles(ray.length);
// Allocate result of max-scan operation on the array of view angles
thrust::device_vector<float> d_scannedAngles(ray.length);
////////////////////////////////////////////////////////////////////////////
// Visibility results
// Allocate visibility results for each point along the ray
thrust::device_vector<Bool> d_visibilities(ray.length);
thrust::host_vector<Bool> h_visibilities(ray.length);
thrust::host_vector<Bool> h_visibilitiesRef(ray.length);
////////////////////////////////////////////////////////////////////////////
// Reference solution
lineOfSight_gold(heightField, ray, (Bool *)&h_visibilitiesRef[0]);
////////////////////////////////////////////////////////////////////////////
// Device solution
// Execution configuration
dim3 block(256);
dim3 grid((uint)ceil(ray.length / (double)block.x));
// Compute device solution
printf("Line of sight\n");
sdkStartTimer(&timer);
for (uint i = 0; i < numIterations; ++i)
{
// Compute view angle for each point along the ray
computeAngles_kernel<<<grid, block>>>(ray, thrust::raw_pointer_cast(&d_angles[0]));
getLastCudaError("Kernel execution failed");
// Perform a max-scan operation on the array of view angles
thrust::inclusive_scan(d_angles.begin(), d_angles.end(), d_scannedAngles.begin(), thrust::maximum<float>());
getLastCudaError("Kernel execution failed");
// Compute visibility results based on the array of view angles
// and its scanned version
computeVisibilities_kernel<<<grid, block>>>(thrust::raw_pointer_cast(&d_angles[0]),
thrust::raw_pointer_cast(&d_scannedAngles[0]),
ray.length,
thrust::raw_pointer_cast(&d_visibilities[0]));
getLastCudaError("Kernel execution failed");
}
cudaDeviceSynchronize();
sdkStopTimer(&timer);
getLastCudaError("Kernel execution failed");
// Copy visibility results back to the host
thrust::copy(d_visibilities.begin(), d_visibilities.end(), h_visibilities.begin());
// Compare device visibility results against reference results
bool res = compareData(thrust::raw_pointer_cast(&h_visibilitiesRef[0]),
thrust::raw_pointer_cast(&h_visibilities[0]), ray.length, 0.0f, 0.0f);
printf("Average time: %f ms\n\n", sdkGetTimerValue(&timer) / numIterations);
sdkResetTimer(&timer);
// Cleanup memory
checkCudaErrors(cudaFreeArray(heightFieldArray));
return res;
}
////////////////////////////////////////////////////////////////////////////////
//! Compute view angles for each point along the ray
//! @param ray ray
//! @param angles view angles
////////////////////////////////////////////////////////////////////////////////
__global__ void computeAngles_kernel(const Ray ray, float *angles)
{
uint i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < ray.length)
{
float2 location = getLocation(ray, i + 1);
float height = tex2D(g_HeightFieldTex, location.x, location.y);
float angle = getAngle(ray, location, height);
angles[i] = angle;
}
}
////////////////////////////////////////////////////////////////////////////////
//! Compute visibility for each point along the ray
//! @param angles view angles
//! @param scannedAngles max-scanned view angles
//! @param numAngles number of view angles
//! @param visibilities boolean array indicating the visibility of each point
//! along the ray
////////////////////////////////////////////////////////////////////////////////
__global__ void computeVisibilities_kernel(const float *angles,
const float *scannedAngles,
int numAngles,
Bool *visibilities)
{
uint i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numAngles)
{
visibilities[i] = scannedAngles[i] <= angles[i];
}
}
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! @param heightField height field
//! @param ray ray
//! @param visibilities boolean array indicating the visibility of each point
//! along the ray
////////////////////////////////////////////////////////////////////////////////
void lineOfSight_gold(const HeightField heightField, const Ray ray,
Bool *visibilities)
{
float angleMax = asinf(-1.0f);
for (int i = 0; i < ray.length; ++i)
{
float2 location = getLocation(ray, i + 1);
float height = *(heightField.height
+ heightField.width * (int)floorf(location.y)
+ (int)floorf(location.x));
float angle = getAngle(ray, location, height);
if (angle > angleMax)
{
angleMax = angle;
visibilities[i] = True;
}
else
{
visibilities[i] = False;
}
}
}
////////////////////////////////////////////////////////////////////////////////
//! Compute the 2D coordinates of the point located at i steps from the origin
//! of the ray
//! @param ray ray
//! @param i integer offset along the ray
////////////////////////////////////////////////////////////////////////////////
__device__ __host__ float2 getLocation(const Ray ray, int i)
{
float step = i * ray.oneOverLength;
return make_float2(ray.origin.x, ray.origin.y) + ray.dir * step;
}
////////////////////////////////////////////////////////////////////////////////
//! Compute the angle of view between a 3D point and the origin of the ray
//! @param ray ray
//! @param location 2D coordinates of the input point
//! @param height height of the input point
////////////////////////////////////////////////////////////////////////////////
__device__ __host__ float getAngle(const Ray ray, float2 location, float height)
{
float2 dir = location - make_float2(ray.origin.x, ray.origin.y);
return atanf((height - ray.origin.z) / length(dir));
}
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
////////////////////////////////////////////////////////////////////////////////
// A structure of 2D points (structure of arrays).
////////////////////////////////////////////////////////////////////////////////
class Points
{
float *m_x;
float *m_y;
public:
// Constructor.
__host__ __device__ Points() : m_x(NULL), m_y(NULL) {}
// Constructor.
__host__ __device__ Points(float *x, float *y) : m_x(x), m_y(y) {}
// Get a point.
__host__ __device__ __forceinline__ float2 get_point(int idx) const
{
return make_float2(m_x[idx], m_y[idx]);
}
// Set a point.
__host__ __device__ __forceinline__ void set_point(int idx, const float2 &p)
{
m_x[idx] = p.x;
m_y[idx] = p.y;
}
// Set the pointers.
__host__ __device__ __forceinline__ void set(float *x, float *y)
{
m_x = x;
m_y = y;
}
};
////////////////////////////////////////////////////////////////////////////////
// A 2D bounding box
////////////////////////////////////////////////////////////////////////////////
class Bounding_box
{
// Extreme points of the bounding box.
float2 m_p_min;
float2 m_p_max;
public:
// Constructor. Create a unit box.
__host__ __device__ Bounding_box()
{
m_p_min = make_float2(0.0f, 0.0f);
m_p_max = make_float2(1.0f, 1.0f);
}
// Compute the center of the bounding-box.
__host__ __device__ void compute_center(float2 &center) const
{
center.x = 0.5f * (m_p_min.x + m_p_max.x);
center.y = 0.5f * (m_p_min.y + m_p_max.y);
}
// The points of the box.
__host__ __device__ __forceinline__ const float2 &get_max() const
{
return m_p_max;
}
__host__ __device__ __forceinline__ const float2 &get_min() const
{
return m_p_min;
}
// Does a box contain a point.
__host__ __device__ bool contains(const float2 &p) const
{
return p.x >= m_p_min.x && p.x < m_p_max.x && p.y >= m_p_min.y && p.y < m_p_max.y;
}
// Define the bounding box.
__host__ __device__ void set(float min_x, float min_y, float max_x, float max_y)
{
m_p_min.x = min_x;
m_p_min.y = min_y;
m_p_max.x = max_x;
m_p_max.y = max_y;
}
};
////////////////////////////////////////////////////////////////////////////////
// A node of a quadtree.
////////////////////////////////////////////////////////////////////////////////
class Quadtree_node
{
// The identifier of the node.
int m_id;
// The bounding box of the tree.
Bounding_box m_bounding_box;
// The range of points.
int m_begin, m_end;
public:
// Constructor.
__host__ __device__ Quadtree_node() : m_id(0), m_begin(0), m_end(0)
{}
// The ID of a node at its level.
__host__ __device__ int id() const
{
return m_id;
}
// Set the ID of a node at its level.
__host__ __device__ void set_id(int new_id)
{
m_id = new_id;
}
// The bounding box.
__host__ __device__ __forceinline__ const Bounding_box &bounding_box() const
{
return m_bounding_box;
}
// Set the bounding box.
__host__ __device__ __forceinline__ void set_bounding_box(float min_x, float min_y, float max_x, float max_y)
{
m_bounding_box.set(min_x, min_y, max_x, max_y);
}
// The number of points in the tree.
__host__ __device__ __forceinline__ int num_points() const
{
return m_end - m_begin;
}
// The range of points in the tree.
__host__ __device__ __forceinline__ int points_begin() const
{
return m_begin;
}
__host__ __device__ __forceinline__ int points_end() const
{
return m_end;
}
// Define the range for that node.
__host__ __device__ __forceinline__ void set_range(int begin, int end)
{
m_begin = begin;
m_end = end;
}
};
////////////////////////////////////////////////////////////////////////////////
// Algorithm parameters.
////////////////////////////////////////////////////////////////////////////////
struct Parameters
{
// Choose the right set of points to use as in/out.
int point_selector;
// The number of nodes at a given level (4^k for level k).
int num_nodes_at_this_level;
// The recursion depth.
int depth;
// The max value for depth.
const int max_depth;
// The minimum number of points in a node to stop recursion.
const int min_points_per_node;
// Constructor set to default values.
__host__ __device__ Parameters(int max_depth, int min_points_per_node) :
point_selector(0),
num_nodes_at_this_level(1),
depth(0),
max_depth(max_depth),
min_points_per_node(min_points_per_node)
{}
// Copy constructor. Changes the values for next iteration.
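// e.g. starting from (selector=0, nodes=1, depth=0), the successive levels
// see (1, 4, 1), (0, 16, 2), (1, 64, 3), ...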
__host__ __device__ Parameters(const Parameters &params, bool) :
point_selector((params.point_selector+1) % 2),
num_nodes_at_this_level(4*params.num_nodes_at_this_level),
depth(params.depth+1),
max_depth(params.max_depth),
min_points_per_node(params.min_points_per_node)
{}
};
////////////////////////////////////////////////////////////////////////////////
// Build a quadtree on the GPU. Use CUDA Dynamic Parallelism.
//
// The algorithm works as follows. The host (CPU) launches one block of
// NUM_THREADS_PER_BLOCK threads. That block will do the following steps:
//
// 1- Check the number of points and its depth.
//
// We impose a maximum depth on the tree and a minimum number of points per
// node. If the maximum depth is exceeded or the minimum number of points is
// reached, the threads in the block exit.
//
// Before exiting, they perform a buffer swap if it is needed. Indeed, the
// algorithm uses two buffers to permute the points and make sure they are
// properly distributed in the quadtree. By design we want all points to be
// in the first buffer of points at the end of the algorithm. That is why
// we may have to swap the buffers before leaving (if the points are in the
// 2nd buffer).
//
// 2- Count the number of points in each child.
//
// If the depth is not too high and the number of points is sufficient, the
// block has to dispatch the points into four geometrical buckets: Its
// children. For that purpose, we compute the center of the bounding box and
// count the number of points in each quadrant.
//
// The set of points is divided into sections. Each section is given to a
// warp of threads (32 threads). Warps use __ballot and __popc intrinsics
// to count the points. See the Programming Guide for more information about
// those functions.
//
// 3- Scan the warps' results to know the "global" numbers.
//
// Warps work independently from each other. At the end, each warp knows the
// number of points in its section. To know the numbers for the block, the
// block has to run a scan/reduce at the block level. It's a traditional
// approach. The implementation in this sample is not as optimized as what
// could be found in fast radix sorts, for example, but it relies on the same
// idea.
//
// 4- Move points.
//
// Now that the block knows how many points go in each of its 4 children, it
// remains to dispatch the points. It is straightforward.
//
// 5- Launch new blocks.
//
// The block launches four new blocks: one per child. Each of the four blocks
// will apply the same algorithm.
////////////////////////////////////////////////////////////////////////////////
template< int NUM_THREADS_PER_BLOCK >
__global__
void build_quadtree_kernel(Quadtree_node *nodes, Points *points, Parameters params)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// The number of warps in a block.
const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warpSize;
// Shared memory to store the number of points.
extern __shared__ int smem[];
// s_num_pts[4][NUM_WARPS_PER_BLOCK];
// Addresses of shared memory.
volatile int *s_num_pts[4];
for (int i = 0 ; i < 4 ; ++i)
s_num_pts[i] = (volatile int *) &smem[i*NUM_WARPS_PER_BLOCK];
// Compute the coordinates of the threads in the block.
const int warp_id = threadIdx.x / warpSize;
const int lane_id = threadIdx.x % warpSize;
// Mask for compaction.
int lane_mask_lt = (1 << lane_id) - 1; // Same as: asm( "mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt) );
// The current node.
Quadtree_node &node = nodes[blockIdx.x];
node.set_id(node.id() + blockIdx.x);
// The number of points in the node.
int num_points = node.num_points();
float2 center;
int range_begin, range_end;
int warp_cnts[4] = {0, 0, 0, 0};
//
// 1- Check the number of points and its depth.
//
// Stop the recursion here. Make sure points[0] contains all the points.
if (params.depth >= params.max_depth || num_points <= params.min_points_per_node)
{
if (params.point_selector == 1)
{
int it = node.points_begin(), end = node.points_end();
for (it += threadIdx.x ; it < end ; it += NUM_THREADS_PER_BLOCK)
if (it < end)
points[0].set_point(it, points[1].get_point(it));
}
return;
}
// Compute the center of the bounding box of the points.
const Bounding_box &bbox = node.bounding_box();
bbox.compute_center(center);
// Find how many points to give to each warp.
int num_points_per_warp = max(warpSize, (num_points + NUM_WARPS_PER_BLOCK-1) / NUM_WARPS_PER_BLOCK);
// Each warp of threads will compute the number of points to move to each quadrant.
range_begin = node.points_begin() + warp_id * num_points_per_warp;
range_end = min(range_begin + num_points_per_warp, node.points_end());
//
// 2- Count the number of points in each child.
//
// Input points.
const Points &in_points = points[params.point_selector];
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
// Compute the number of points.
for (int range_it = range_begin + tile32.thread_rank() ; tile32.any(range_it < range_end) ; range_it += warpSize)
{
// Is it still an active thread?
bool is_active = range_it < range_end;
// Load the coordinates of the point.
float2 p = is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f);
// Count top-left points.
int num_pts = __popc(tile32.ballot(is_active && p.x < center.x && p.y >= center.y));
warp_cnts[0] += tile32.shfl(num_pts, 0);
// Count top-right points.
num_pts = __popc(tile32.ballot(is_active && p.x >= center.x && p.y >= center.y));
warp_cnts[1] += tile32.shfl(num_pts, 0);
// Count bottom-left points.
num_pts = __popc(tile32.ballot(is_active && p.x < center.x && p.y < center.y));
warp_cnts[2] += tile32.shfl(num_pts, 0);
// Count bottom-right points.
num_pts = __popc(tile32.ballot(is_active && p.x >= center.x && p.y < center.y));
warp_cnts[3] += tile32.shfl(num_pts, 0);
}
if (tile32.thread_rank() == 0)
{
s_num_pts[0][warp_id] = warp_cnts[0];
s_num_pts[1][warp_id] = warp_cnts[1];
s_num_pts[2][warp_id] = warp_cnts[2];
s_num_pts[3][warp_id] = warp_cnts[3];
}
// Make sure warps have finished counting.
cg::sync(cta);
//
// 3- Scan the warps' results to know the "global" numbers.
//
// First 4 warps scan the numbers of points per child (inclusive scan).
if (warp_id < 4)
{
int num_pts = tile32.thread_rank() < NUM_WARPS_PER_BLOCK ? s_num_pts[warp_id][tile32.thread_rank()] : 0;
#pragma unroll
for (int offset = 1 ; offset < NUM_WARPS_PER_BLOCK ; offset *= 2)
{
int n = tile32.shfl_up(num_pts, offset);
if (tile32.thread_rank() >= offset)
num_pts += n;
}
if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK)
s_num_pts[warp_id][tile32.thread_rank()] = num_pts;
}
cg::sync(cta);
// Compute global offsets.
if (warp_id == 0)
{
int sum = s_num_pts[0][NUM_WARPS_PER_BLOCK-1];
for (int row = 1 ; row < 4 ; ++row)
{
int tmp = s_num_pts[row][NUM_WARPS_PER_BLOCK-1];
cg::sync(tile32);
if (tile32.thread_rank() < NUM_WARPS_PER_BLOCK)
s_num_pts[row][tile32.thread_rank()] += sum;
cg::sync(tile32);
sum += tmp;
}
}
cg::sync(cta);
// Make the scan exclusive.
int val = 0;
if (threadIdx.x < 4*NUM_WARPS_PER_BLOCK)
{
val = threadIdx.x == 0 ? 0 : smem[threadIdx.x-1];
val += node.points_begin();
}
cg::sync(cta);
if (threadIdx.x < 4*NUM_WARPS_PER_BLOCK)
{
smem[threadIdx.x] = val;
}
cg::sync(cta);
//
// 4- Move points.
//
if (!(params.depth >= params.max_depth || num_points <= params.min_points_per_node))
{
// Output points.
Points &out_points = points[(params.point_selector+1) % 2];
warp_cnts[0] = s_num_pts[0][warp_id];
warp_cnts[1] = s_num_pts[1][warp_id];
warp_cnts[2] = s_num_pts[2][warp_id];
warp_cnts[3] = s_num_pts[3][warp_id];
const Points &in_points = points[params.point_selector];
// Reorder points.
for (int range_it = range_begin + tile32.thread_rank(); tile32.any(range_it < range_end) ; range_it += warpSize)
{
// Is it still an active thread?
bool is_active = range_it < range_end;
// Load the coordinates of the point.
float2 p = is_active ? in_points.get_point(range_it) : make_float2(0.0f, 0.0f);
// Count top-left points.
bool pred = is_active && p.x < center.x && p.y >= center.y;
int vote = tile32.ballot(pred);
int dest = warp_cnts[0] + __popc(vote & lane_mask_lt);
if (pred)
out_points.set_point(dest, p);
warp_cnts[0] += tile32.shfl(__popc(vote), 0);
// Count top-right points.
pred = is_active && p.x >= center.x && p.y >= center.y;
vote = tile32.ballot(pred);
dest = warp_cnts[1] + __popc(vote & lane_mask_lt);
if (pred)
out_points.set_point(dest, p);
warp_cnts[1] += tile32.shfl(__popc(vote), 0);
// Count bottom-left points.
pred = is_active && p.x < center.x && p.y < center.y;
vote = tile32.ballot(pred);
dest = warp_cnts[2] + __popc(vote & lane_mask_lt);
if (pred)
out_points.set_point(dest, p);
warp_cnts[2] += tile32.shfl(__popc(vote), 0);
// Count bottom-right points.
pred = is_active && p.x >= center.x && p.y < center.y;
vote = tile32.ballot(pred);
dest = warp_cnts[3] + __popc(vote & lane_mask_lt);
if (pred)
out_points.set_point(dest, p);
warp_cnts[3] += tile32.shfl(__popc(vote), 0);
}
}
cg::sync(cta);
if (tile32.thread_rank() == 0)
{
s_num_pts[0][warp_id] = warp_cnts[0];
s_num_pts[1][warp_id] = warp_cnts[1] ;
s_num_pts[2][warp_id] = warp_cnts[2] ;
s_num_pts[3][warp_id] = warp_cnts[3];
}
cg::sync(cta);
//
// 5- Launch new blocks.
//
if (!(params.depth >= params.max_depth || num_points <= params.min_points_per_node))
{
// The last thread launches new blocks.
if (threadIdx.x == NUM_THREADS_PER_BLOCK-1 )
{
// The children.
Quadtree_node *children = &nodes[params.num_nodes_at_this_level];
// The offsets of the children at their level.
int child_offset = 4*node.id();
// Set IDs.
children[child_offset+0].set_id(4*node.id()+ 0);
children[child_offset+1].set_id(4*node.id()+ 4);
children[child_offset+2].set_id(4*node.id()+ 8);
children[child_offset+3].set_id(4*node.id()+12);
const Bounding_box &bbox = node.bounding_box();
// Points of the bounding-box.
const float2 &p_min = bbox.get_min();
const float2 &p_max = bbox.get_max();
// Set the bounding boxes of the children.
children[child_offset+0].set_bounding_box(p_min.x , center.y, center.x, p_max.y); // Top-left.
children[child_offset+1].set_bounding_box(center.x, center.y, p_max.x , p_max.y); // Top-right.
children[child_offset+2].set_bounding_box(p_min.x , p_min.y , center.x, center.y); // Bottom-left.
children[child_offset+3].set_bounding_box(center.x, p_min.y , p_max.x , center.y); // Bottom-right.
// Set the ranges of the children.
children[child_offset+0].set_range(node.points_begin(), s_num_pts[0][warp_id]);
children[child_offset+1].set_range(s_num_pts[0][warp_id], s_num_pts[1][warp_id]);
children[child_offset+2].set_range(s_num_pts[1][warp_id], s_num_pts[2][warp_id]);
children[child_offset+3].set_range(s_num_pts[2][warp_id], s_num_pts[3][warp_id]);
// Launch 4 children.
build_quadtree_kernel<NUM_THREADS_PER_BLOCK><<<4, NUM_THREADS_PER_BLOCK, 4 *NUM_WARPS_PER_BLOCK *sizeof(int)>>>(children, points, Parameters(params, true));
}
}
}
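////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch (not part of the original sample): the warp-level
// counting in step 2 of build_quadtree_kernel boils down to this pattern.
// ballot() packs one predicate per lane into a 32-bit mask and __popc()
// counts the set bits, so every lane of the tile learns how many lanes
// voted "true". The helper name is hypothetical and is not used elsewhere.
////////////////////////////////////////////////////////////////////////////////
__device__ __forceinline__ int count_votes_in_tile(cg::thread_block_tile<32> tile, bool predicate)
{
    unsigned int mask = tile.ballot(predicate); // bit i is set iff lane i voted true
    return __popc(mask);                        // number of true votes in the warp
}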
////////////////////////////////////////////////////////////////////////////////
// Make sure a Quadtree is properly defined.
////////////////////////////////////////////////////////////////////////////////
bool check_quadtree(const Quadtree_node *nodes, int idx, int num_pts, Points *pts, Parameters params)
{
const Quadtree_node &node = nodes[idx];
int num_points = node.num_points();
if (!(params.depth == params.max_depth || num_points <= params.min_points_per_node))
{
int num_points_in_children = 0;
num_points_in_children += nodes[params.num_nodes_at_this_level + 4*idx+0].num_points();
num_points_in_children += nodes[params.num_nodes_at_this_level + 4*idx+1].num_points();
num_points_in_children += nodes[params.num_nodes_at_this_level + 4*idx+2].num_points();
num_points_in_children += nodes[params.num_nodes_at_this_level + 4*idx+3].num_points();
if (num_points_in_children != node.num_points())
return false;
return check_quadtree(&nodes[params.num_nodes_at_this_level], 4*idx+0, num_pts, pts, Parameters(params, true)) &&
check_quadtree(&nodes[params.num_nodes_at_this_level], 4*idx+1, num_pts, pts, Parameters(params, true)) &&
check_quadtree(&nodes[params.num_nodes_at_this_level], 4*idx+2, num_pts, pts, Parameters(params, true)) &&
check_quadtree(&nodes[params.num_nodes_at_this_level], 4*idx+3, num_pts, pts, Parameters(params, true));
}
const Bounding_box &bbox = node.bounding_box();
for (int it = node.points_begin() ; it < node.points_end() ; ++it)
{
if (it >= num_pts)
return false;
float2 p = pts->get_point(it);
if (!bbox.contains(p))
return false;
}
return true;
}
////////////////////////////////////////////////////////////////////////////////
// Parallel random number generator.
////////////////////////////////////////////////////////////////////////////////
struct Random_generator
{
int count;
__host__ __device__
Random_generator() : count(0) {}
__host__ __device__ unsigned int hash(unsigned int a)
{
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
}
__host__ __device__ __forceinline__ thrust::tuple<float, float> operator()()
{
#ifdef __CUDA_ARCH__
unsigned seed = hash(blockIdx.x*blockDim.x + threadIdx.x + count);
// thrust::generate may call operator() more than once per thread.
// Hence, increment count by grid size to ensure uniqueness of seed
count += blockDim.x * gridDim.x;
#else
unsigned seed = hash(0);
#endif
thrust::default_random_engine rng(seed);
thrust::random::uniform_real_distribution<float> distrib;
return thrust::make_tuple(distrib(rng), distrib(rng));
}
};
////////////////////////////////////////////////////////////////////////////////
// Allocate GPU structs, launch kernel and clean up
////////////////////////////////////////////////////////////////////////////////
bool cdpQuadtree(int warp_size)
{
// Constants to control the algorithm.
const int num_points = 1024;
const int max_depth = 8;
const int min_points_per_node = 16;
// Allocate memory for points.
thrust::device_vector<float> x_d0(num_points);
thrust::device_vector<float> x_d1(num_points);
thrust::device_vector<float> y_d0(num_points);
thrust::device_vector<float> y_d1(num_points);
// Generate random points.
Random_generator rnd;
thrust::generate(
thrust::make_zip_iterator(thrust::make_tuple(x_d0.begin(), y_d0.begin())),
thrust::make_zip_iterator(thrust::make_tuple(x_d0.end(), y_d0.end())),
rnd);
// Host structures to analyze the device ones.
Points points_init[2];
points_init[0].set(thrust::raw_pointer_cast(&x_d0[0]), thrust::raw_pointer_cast(&y_d0[0]));
points_init[1].set(thrust::raw_pointer_cast(&x_d1[0]), thrust::raw_pointer_cast(&y_d1[0]));
// Allocate memory to store points.
Points *points;
checkCudaErrors(cudaMalloc((void **) &points, 2*sizeof(Points)));
checkCudaErrors(cudaMemcpy(points, points_init, 2*sizeof(Points), cudaMemcpyHostToDevice));
// We could use a closed form...
int max_nodes = 0;
for (int i = 0, num_nodes_at_level = 1 ; i < max_depth ; ++i, num_nodes_at_level *= 4)
max_nodes += num_nodes_at_level;
// Allocate memory to store the tree.
Quadtree_node root;
root.set_range(0, num_points);
Quadtree_node *nodes;
checkCudaErrors(cudaMalloc((void **) &nodes, max_nodes*sizeof(Quadtree_node)));
checkCudaErrors(cudaMemcpy(nodes, &root, sizeof(Quadtree_node), cudaMemcpyHostToDevice));
// We set the recursion limit for CDP to max_depth.
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
// Build the quadtree.
Parameters params(max_depth, min_points_per_node);
std::cout << "Launching CDP kernel to build the quadtree" << std::endl;
const int NUM_THREADS_PER_BLOCK = 128; // Do not use less than 128 threads.
const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warp_size;
const size_t smem_size = 4*NUM_WARPS_PER_BLOCK*sizeof(int);
build_quadtree_kernel<NUM_THREADS_PER_BLOCK><<<1, NUM_THREADS_PER_BLOCK, smem_size>>>(nodes, points, params);
checkCudaErrors(cudaGetLastError());
// Copy points to CPU.
thrust::host_vector<float> x_h(x_d0);
thrust::host_vector<float> y_h(y_d0);
Points host_points;
host_points.set(thrust::raw_pointer_cast(&x_h[0]), thrust::raw_pointer_cast(&y_h[0]));
// Copy nodes to CPU.
Quadtree_node *host_nodes = new Quadtree_node[max_nodes];
checkCudaErrors(cudaMemcpy(host_nodes, nodes, max_nodes *sizeof(Quadtree_node), cudaMemcpyDeviceToHost));
// Validate the results.
bool ok = check_quadtree(host_nodes, 0, num_points, &host_points, params);
std::cout << "Results: " << (ok ? "OK" : "FAILED") << std::endl;
// Free CPU memory.
delete[] host_nodes;
// Free memory.
checkCudaErrors(cudaFree(nodes));
checkCudaErrors(cudaFree(points));
return ok;
}
////////////////////////////////////////////////////////////////////////////////
// Main entry point.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// Find/set the device.
// The test requires an architecture SM35 or greater (CDP capable).
int cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProps;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, cuda_device));
int cdpCapable = (deviceProps.major == 3 && deviceProps.minor >= 5) || deviceProps.major >=4;
printf("GPU device %s has compute capabilities (SM %d.%d)\n", deviceProps.name, deviceProps.major, deviceProps.minor);
if (!cdpCapable)
{
std::cerr << "cdpQuadTree requires SM 3.5 or higher to use CUDA Dynamic Parallelism. Exiting...\n" << std::endl;
exit(EXIT_WAIVED);
}
bool ok = cdpQuadtree(deviceProps.warpSize);
return (ok ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
extern "C" __global__ void kernelFunction(int *input)
{
input[threadIdx.x] = 32 - threadIdx.x;
}
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------------------
// Transpose
//
// This file contains both device and host code for transposing a floating-point
// matrix. It performs several transpose kernels, which incrementally improve performance
// through coalescing, removing shared memory bank conflicts, and eliminating partition
// camping. Several of the kernels perform a copy, used to represent the best case
// performance that a transpose can achieve.
//
// Please see the whitepaper in the docs folder of the transpose project for a detailed
// description of this performance study.
// ----------------------------------------------------------------------------------------
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Utilities and system includes
#include <helper_string.h> // helper for string parsing
#include <helper_image.h> // helper for image and data comparison
#include <helper_cuda.h> // helper for cuda error checking functions
const char *sSDKsample = "Transpose";
// Each block transposes/copies a tile of TILE_DIM x TILE_DIM elements
// using TILE_DIM x BLOCK_ROWS threads, so that each thread transposes
// TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of BLOCK_ROWS
#define TILE_DIM 16
#define BLOCK_ROWS 16
// This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
int MATRIX_SIZE_X = 1024;
int MATRIX_SIZE_Y = 1024;
int MUL_FACTOR = TILE_DIM;
#define FLOOR(a,b) (a-(a%b))
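// e.g. FLOOR(1000, 512) == 512: rounds a down to the nearest multiple of b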
// Compute the number of tiles needed to illustrate performance cases for SM20+ hardware
int MAX_TILES = (FLOOR(MATRIX_SIZE_X,512) * FLOOR(MATRIX_SIZE_Y,512)) / (TILE_DIM *TILE_DIM);
// Number of repetitions used for timing. Two sets of repetitions are performed:
// 1) over kernel launches and 2) inside the kernel over just the loads and stores
#define NUM_REPS 100
// -------------------------------------------------------
// Copies
// width and height must be integral multiples of TILE_DIM
// -------------------------------------------------------
__global__ void copy(float *odata, float *idata, int width, int height)
{
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index = xIndex + width*yIndex;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
odata[index+i*width] = idata[index+i*width];
}
}
__global__ void copySharedMem(float *odata, float *idata, int width, int height)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float tile[TILE_DIM][TILE_DIM];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index = xIndex + width*yIndex;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
if (xIndex < width && yIndex < height)
{
tile[threadIdx.y][threadIdx.x] = idata[index];
}
}
cg::sync(cta);
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
if (xIndex < height && yIndex < width)
{
odata[index] = tile[threadIdx.y][threadIdx.x];
}
}
}
// -------------------------------------------------------
// Transposes
// width and height must be integral multiples of TILE_DIM
// -------------------------------------------------------
__global__ void transposeNaive(float *odata, float *idata, int width, int height)
{
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + width * yIndex;
int index_out = yIndex + height * xIndex;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
odata[index_out+i] = idata[index_in+i*width];
}
}
// coalesced transpose (with bank conflicts)
__global__ void transposeCoalesced(float *odata, float *idata, int width, int height)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float tile[TILE_DIM][TILE_DIM];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
}
cg::sync(cta);
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
}
}
// Coalesced transpose with no bank conflicts
__global__ void transposeNoBankConflicts(float *odata, float *idata, int width, int height)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float tile[TILE_DIM][TILE_DIM+1];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
}
cg::sync(cta);
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
}
}
// Transpose that effectively reorders execution of thread blocks along diagonals of the
// matrix (also coalesced and has no bank conflicts)
//
// Here blockIdx.x is interpreted as the distance along a diagonal and blockIdx.y as
// corresponding to different diagonals
//
// blockIdx_x and blockIdx_y expressions map the diagonal coordinates to the more commonly
// used cartesian coordinates so that the only changes to the code from the coalesced version
// are the calculation of the blockIdx_x and blockIdx_y and replacement of blockIdx.x and
// blockIdx.y with the subscripted versions in the remaining code
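//
// Example for a square 4x4 grid of blocks: the block launched with
// blockIdx = (1,2) computes blockIdx_y = 1 and blockIdx_x = (1+2)%4 = 3,
// so it reads the input tile at cartesian tile coordinates (3,1)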
__global__ void transposeDiagonal(float *odata, float *idata, int width, int height)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float tile[TILE_DIM][TILE_DIM+1];
int blockIdx_x, blockIdx_y;
// do diagonal reordering
if (width == height)
{
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
}
else
{
int bid = blockIdx.x + gridDim.x*blockIdx.y;
blockIdx_y = bid%gridDim.y;
blockIdx_x = ((bid/gridDim.y)+blockIdx_y)%gridDim.x;
}
// from here on the code is same as previous kernel except blockIdx_x replaces blockIdx.x
// and similarly for y
int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
}
cg::sync(cta);
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS)
{
odata[index_out+i*height] = tile[threadIdx.x][threadIdx.y+i];
}
}
// --------------------------------------------------------------------
// Partial transposes
// NB: the coarse- and fine-grained routines only perform part of a
// transpose and will fail the test against the reference solution
//
// They are used to assess performance characteristics of different
// components of a full transpose
// --------------------------------------------------------------------
__global__ void transposeFineGrained(float *odata, float *idata, int width, int height)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float block[TILE_DIM][TILE_DIM+1];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index = xIndex + (yIndex)*width;
for (int i=0; i < TILE_DIM; i += BLOCK_ROWS)
{
block[threadIdx.y+i][threadIdx.x] = idata[index+i*width];
}
cg::sync(cta);
for (int i=0; i < TILE_DIM; i += BLOCK_ROWS)
{
odata[index+i*height] = block[threadIdx.x][threadIdx.y+i];
}
}
__global__ void transposeCoarseGrained(float *odata, float *idata, int width, int height)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float block[TILE_DIM][TILE_DIM+1];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*width;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*height;
for (int i=0; i<TILE_DIM; i += BLOCK_ROWS)
{
block[threadIdx.y+i][threadIdx.x] = idata[index_in+i*width];
}
cg::sync(cta);
for (int i=0; i<TILE_DIM; i += BLOCK_ROWS)
{
odata[index_out+i*height] = block[threadIdx.y+i][threadIdx.x];
}
}
// ---------------------
// host utility routines
// ---------------------
void computeTransposeGold(float *gold, float *idata,
const int size_x, const int size_y)
{
for (int y = 0; y < size_y; ++y)
{
for (int x = 0; x < size_x; ++x)
{
gold[(x * size_y) + y] = idata[(y * size_x) + x];
}
}
}
void getParams(int argc, char **argv, cudaDeviceProp &deviceProp, int &size_x, int &size_y, int max_tile_dim)
{
// set matrix size (if the (x,y) dimensions of the matrix are not square, this will have to be modified)
if (checkCmdLineFlag(argc, (const char **)argv, "dimX"))
{
size_x = getCmdLineArgumentInt(argc, (const char **) argv, "dimX");
if (size_x > max_tile_dim)
{
printf("> MatrixSize X = %d is greater than the recommended size = %d\n", size_x, max_tile_dim);
}
else
{
printf("> MatrixSize X = %d\n", size_x);
}
}
else
{
size_x = max_tile_dim;
size_x = FLOOR(size_x, 512);
}
if (checkCmdLineFlag(argc, (const char **)argv, "dimY"))
{
size_y = getCmdLineArgumentInt(argc, (const char **) argv, "dimY");
if (size_y > max_tile_dim)
{
printf("> MatrixSize Y = %d is greater than the recommended size = %d\n", size_y, max_tile_dim);
}
else
{
printf("> MatrixSize Y = %d\n", size_y);
}
}
else
{
size_y = max_tile_dim;
size_y = FLOOR(size_y, 512);
}
}
void
showHelp()
{
printf("\n%s : Command line options\n", sSDKsample);
printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
printf("> The default matrix size can be overridden with these parameters\n");
printf("\t-dimX=row_dim_size (matrix row dimensions)\n");
printf("\t-dimY=col_dim_size (matrix column dimensions)\n");
}
// ----
// main
// ----
int
main(int argc, char **argv)
{
// Start logs
printf("%s Starting...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "help"))
{
showHelp();
return 0;
}
int devID = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
// get number of SMs on this GPU
checkCudaErrors(cudaGetDevice(&devID));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// compute the scaling factor (for GPUs with fewer MPs)
float scale_factor, total_tiles;
scale_factor = max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
printf("> Device %d: \"%s\"\n", devID, deviceProp.name);
printf("> SM Capability %d.%d detected:\n", deviceProp.major, deviceProp.minor);
// Calculate number of tiles we will run for the Matrix Transpose performance tests
int size_x, size_y, max_matrix_dim, matrix_size_test;
matrix_size_test = 512; // we round down max_matrix_dim for this perf test
total_tiles = (float)MAX_TILES / scale_factor;
max_matrix_dim = FLOOR((int)(floor(sqrt(total_tiles))* TILE_DIM), matrix_size_test);
// This is the minimum size allowed
if (max_matrix_dim == 0)
{
max_matrix_dim = matrix_size_test;
}
printf("> [%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
deviceProp.name, deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
printf("> Compute performance scaling factor = %4.2f\n", scale_factor);
// Extract parameters if there are any, command line -dimx and -dimy can override
// any of these settings
getParams(argc, argv, deviceProp, size_x, size_y, max_matrix_dim);
if (size_x != size_y)
{
printf("\n[%s] does not support non-square matrices (row_dim_size(%d) != col_dim_size(%d))\nExiting...\n\n", sSDKsample, size_x, size_y);
exit(EXIT_FAILURE);
}
if (size_x%TILE_DIM != 0 || size_y%TILE_DIM != 0)
{
printf("[%s] Matrix size must be integral multiple of tile size\nExiting...\n\n", sSDKsample);
exit(EXIT_FAILURE);
}
// kernel pointer and descriptor
void (*kernel)(float *, float *, int, int);
const char *kernelName;
// execution configuration parameters
dim3 grid(size_x/TILE_DIM, size_y/TILE_DIM), threads(TILE_DIM,BLOCK_ROWS);
if (grid.x < 1 || grid.y < 1)
{
printf("[%s] grid size computation incorrect in test \nExiting...\n\n", sSDKsample);
exit(EXIT_FAILURE);
}
// CUDA events
cudaEvent_t start, stop;
// size of memory required to store the matrix
size_t mem_size = static_cast<size_t>(sizeof(float) * size_x*size_y);
if (2*mem_size > deviceProp.totalGlobalMem)
{
printf("Input matrix size is larger than the available device memory!\n");
printf("Please choose a smaller size matrix\n");
exit(EXIT_FAILURE);
}
// allocate host memory
float *h_idata = (float *) malloc(mem_size);
float *h_odata = (float *) malloc(mem_size);
float *transposeGold = (float *) malloc(mem_size);
float *gold;
// allocate device memory
float *d_idata, *d_odata;
checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));
// initialize host data
for (int i = 0; i < (size_x*size_y); ++i)
{
h_idata[i] = (float) i;
}
// copy host data to device
checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// Compute reference transpose solution
computeTransposeGold(transposeGold, h_idata, size_x, size_y);
// print out common data for all kernels
printf("\nMatrix size: %dx%d (%dx%d tiles), tile size: %dx%d, block size: %dx%d\n\n",
size_x, size_y, size_x/TILE_DIM, size_y/TILE_DIM, TILE_DIM, TILE_DIM, TILE_DIM, BLOCK_ROWS);
// initialize events
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
//
// loop over different kernels
//
bool success = true;
for (int k = 0; k<8; k++)
{
// set kernel pointer
switch (k)
{
case 0:
kernel = &copy;
kernelName = "simple copy ";
break;
case 1:
kernel = &copySharedMem;
kernelName = "shared memory copy";
break;
case 2:
kernel = &transposeNaive;
kernelName = "naive ";
break;
case 3:
kernel = &transposeCoalesced;
kernelName = "coalesced ";
break;
case 4:
kernel = &transposeNoBankConflicts;
kernelName = "optimized ";
break;
case 5:
kernel = &transposeCoarseGrained;
kernelName = "coarse-grained ";
break;
case 6:
kernel = &transposeFineGrained;
kernelName = "fine-grained ";
break;
case 7:
kernel = &transposeDiagonal;
kernelName = "diagonal ";
break;
}
// set reference solution
if (kernel == &copy || kernel == &copySharedMem)
{
gold = h_idata;
}
else if (kernel == &transposeCoarseGrained || kernel == &transposeFineGrained)
{
gold = h_odata; // fine- and coarse-grained kernels are not full transposes, so bypass check
}
else
{
gold = transposeGold;
}
// Clear error status
checkCudaErrors(cudaGetLastError());
// warmup to avoid timing startup
kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y);
// take measurements for loop over kernel launches
checkCudaErrors(cudaEventRecord(start, 0));
for (int i=0; i < NUM_REPS; i++)
{
kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y);
// Ensure no launch failure
checkCudaErrors(cudaGetLastError());
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float kernelTime;
checkCudaErrors(cudaEventElapsedTime(&kernelTime, start, stop));
checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
bool res = compareData(gold, h_odata, size_x*size_y, 0.01f, 0.0f);
if (res == false)
{
printf("*** %s kernel FAILED ***\n", kernelName);
success = false;
}
// take measurements for loop inside kernel
checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
res = compareData(gold, h_odata, size_x*size_y, 0.01f, 0.0f);
if (res == false)
{
printf("*** %s kernel FAILED ***\n", kernelName);
success = false;
}
// report effective bandwidths
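// 2.0f: each element is read once and written once; mem_size/(1024*1024*1024)
// converts bytes to GB and the 1000.0f converts the per-launch time
// kernelTime/NUM_REPS from milliseconds to seconds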
float kernelBandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(kernelTime/NUM_REPS);
printf("transpose %s, Throughput = %.4f GB/s, Time = %.5f ms, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
kernelName,
kernelBandwidth,
kernelTime/NUM_REPS,
(size_x *size_y), 1, TILE_DIM *BLOCK_ROWS);
}
// cleanup
free(h_idata);
free(h_odata);
free(transposeGold);
cudaFree(d_idata);
cudaFree(d_odata);
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
if (!success)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
// This is a basic, recursive bitonic sort taken from
// http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/oddn.htm
//
// The parallel code is based on:
// http://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm
//
// The multithread code is from me.
#include <stdio.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include "cdpQuicksort.h"
// Inline PTX call to return index of highest non-zero bit in a word
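// e.g. __btflo(0x13u) == 4; used below to round lengths up to the next power of two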
static __device__ __forceinline__ unsigned int __btflo(unsigned int word)
{
unsigned int ret;
asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
return ret;
}
////////////////////////////////////////////////////////////////////////////////
//
// qcompare
//
// Comparison function. Note difference from libc standard in
// that we take by reference, not by pointer. I can't figure
// out how to get a template-to-pointer specialisation working.
// Perhaps it requires a class?
//
////////////////////////////////////////////////////////////////////////////////
__device__ __forceinline__ int qcompare(unsigned &val1, unsigned &val2)
{
return (val1 > val2) ? 1 : (val1 == val2) ? 0 : -1;
}
////////////////////////////////////////////////////////////////////////////////
//
// Basic any-N bitonic sort. We sort "len" elements of "indata", starting
// from the "offset" elements into the input data array. Note that "outdata"
// can safely be the same as "indata" for an in-place sort (we stage through
// shared memory).
//
// We handle non-power-of-2 sizes by padding out to the next largest power of 2.
// This is the fully-generic version, for sorting arbitrary data which does not
// have a clear "maximum" value. We track "invalid" entries in a separate array
// to make sure that they always sorts as "max value" elements. A template
// parameter "OOR" allows specialisation to optimise away the invalid tracking.
//
// We can do a more specialised version for int/longlong/float/double, in which
// we pad out the array with max-value-of-type elements. That's another function.
//
// The last step copies from indata -> outdata... the rest is done in-place.
// We use shared memory as temporary storage, which puts an upper limit on
// how much data we can sort per block.
//
////////////////////////////////////////////////////////////////////////////////
static __device__ __forceinline__ void bitonicsort_kernel(unsigned *indata, unsigned *outdata, unsigned int offset, unsigned int len, cg::thread_block cta)
{
__shared__ unsigned sortbuf[1024]; // Max of 1024 elements - TODO: make this dynamic
// First copy data into shared memory.
unsigned int inside = (threadIdx.x < len);
sortbuf[threadIdx.x] = inside ? indata[threadIdx.x + offset] : 0xffffffffu;
cg::sync(cta);
// Now the sort loops
// Here, "k" is the sort level (remember bitonic does a multi-level butterfly style sort)
// and "j" is the partner element in the butterfly.
// Two threads each work on one butterfly, because the read/write needs to happen
// simultaneously
for (unsigned int k=2; k<=blockDim.x; k*=2) // Butterfly stride increments in powers of 2
{
for (unsigned int j=k>>1; j>0; j>>=1) // Strides also in powers of two, from k/2 down to 1
{
unsigned int swap_idx = threadIdx.x ^ j; // Index of element we're compare-and-swapping with
unsigned my_elem = sortbuf[threadIdx.x];
unsigned swap_elem = sortbuf[swap_idx];
cg::sync(cta);
// The k'th bit of my threadid (and hence my sort item ID)
// determines if we sort ascending or descending.
// However, since threads are reading from the top AND the bottom of
// the butterfly, if my ID is > swap_idx, then ascending means mine<swap.
// Finally, if either my_elem or swap_elem is out of range, then it
// ALWAYS acts like it's the largest number.
// Confusing? It saves us two writes though.
unsigned int ascend = k * (swap_idx < threadIdx.x);
unsigned int descend = k * (swap_idx > threadIdx.x);
bool swap = false;
if ((threadIdx.x & k) == ascend)
{
if (my_elem > swap_elem)
swap = true;
}
if ((threadIdx.x & k) == descend)
{
if (my_elem < swap_elem)
swap = true;
}
// If we had to swap, then write my data to the other element's position.
// Don't forget to track out-of-range status too!
if (swap)
{
sortbuf[swap_idx] = my_elem;
}
cg::sync(cta);
}
}
// Copy the sorted data from shared memory back to the output buffer
if (threadIdx.x < len)
outdata[threadIdx.x + offset] = sortbuf[threadIdx.x];
}
//////////////////////////////////////////////////////////////////////////////////
// This is an emergency-CTA sort, which sorts an arbitrary sized chunk
// using a single block. Useful for if qsort runs out of nesting depth.
//
// Note that bitonic sort needs enough storage to pad up to the nearest
// power of 2. This means that the double-buffer is always large enough
// (when combined with the main buffer), but we do not get enough space
// to keep OOR information.
//
// This in turn means that this sort does not work with a generic data
// type. It must be a directly-comparable (i.e. with max value) type.
//
////////////////////////////////////////////////////////////////////////////////
static __device__ __forceinline__ void big_bitonicsort_kernel(unsigned *indata, unsigned *outdata, unsigned *backbuf, unsigned int offset, unsigned int len, cg::thread_block cta)
{
unsigned int len2 = 1 << (__btflo(len-1U)+1); // Round up len to nearest power-of-2
if (threadIdx.x >= len2) return; // Early out for case where more threads launched than there is data
// First, set the unused padding values to the maximum value of the data type.
for (unsigned int i=len; i<len2; i+=blockDim.x)
{
unsigned int index = i + threadIdx.x;
if (index < len2)
{
// Must split our index between two buffers
if (index < len)
indata[index+offset] = 0xffffffffu;
else
backbuf[index+offset-len] = 0xffffffffu;
}
}
cg::sync(cta);
// Now the sort loops
// Here, "k" is the sort level (remember bitonic does a multi-level butterfly style sort)
// and "j" is the partner element in the butterfly.
// Two threads each work on one butterfly, because the read/write needs to happen
// simultaneously
for (unsigned int k=2; k<=len2; k*=2) // Butterfly stride increments in powers of 2
{
for (unsigned int j=k>>1; j>0; j>>=1) // Strides also in powers of two, from k/2 down to 1
{
for (unsigned int i=0; i<len2; i+=blockDim.x)
{
unsigned int index = threadIdx.x + i;
unsigned int swap_idx = index ^ j; // Index of element we're compare-and-swapping with
// Only do the swap for index<swap_idx (avoids collision between other threads)
if (swap_idx > index)
{
unsigned my_elem, swap_elem;
if (index < len)
my_elem = indata[index+offset];
else
my_elem = backbuf[index+offset-len];
if (swap_idx < len)
swap_elem = indata[swap_idx+offset];
else
swap_elem = backbuf[swap_idx+offset-len];
// The k'th bit of my index (and hence my sort item ID)
// determines if we sort ascending or descending.
// Also, if either my_elem or swap_elem is out of range, then it
// ALWAYS acts like it's the largest number.
bool swap = false;
if ((index & k) == 0)
{
if (my_elem > swap_elem)
swap = true;
}
if ((index & k) == k)
{
if (my_elem < swap_elem)
swap = true;
}
// If we had to swap, then write my data to the other element's position.
if (swap)
{
if (swap_idx < len)
indata[swap_idx+offset] = my_elem;
else
backbuf[swap_idx+offset-len] = my_elem;
if (index < len)
indata[index+offset] = swap_elem;
else
backbuf[index+offset-len] = swap_elem;
}
}
}
cg::sync(cta); // Only need to sync for each "j" pass
}
}
// Copy the sorted data from the input to the output buffer, because we sort in-place
if (outdata != indata)
{
for (unsigned int i=0; i<len; i+=blockDim.x)
{
unsigned int index = i + threadIdx.x;
if (index < len)
outdata[index+offset] = indata[index+offset];
}
}
}
////////////////////////////////////////////////////////////////////////////////
// KERNELS
////////////////////////////////////////////////////////////////////////////////
__global__ void bitonicsort(unsigned *indata, unsigned *outdata, unsigned int offset, unsigned int len)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
bitonicsort_kernel(indata, outdata, offset, len, cta);
}
__global__ void big_bitonicsort(unsigned *indata, unsigned *outdata, unsigned *backbuf, unsigned int offset, unsigned int len)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
big_bitonicsort_kernel(indata, outdata, backbuf, offset, len, cta);
}
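// Illustrative host-side sketch (hypothetical helper, not used elsewhere): one way
// to launch the bitonicsort kernel above for a small in-place sort. Assumes
// len <= 1024; outdata == indata is safe because the kernel stages through shared
// memory, and the thread count is rounded up to the next power of two as the
// butterfly loops expect.
static void launchBitonicSortSketch(unsigned *d_data, unsigned int offset, unsigned int len)
{
    unsigned int threads = 1;
    while (threads < len) threads <<= 1;   // next power of two >= len
    bitonicsort<<<1, threads>>>(d_data, d_data, offset, len);
}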
////////////////////////////////////////////////////////////////////////////////
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
////////////////////////////////////////////////////////////////////////////////
//
// QUICKSORT.CU
//
// Implementation of a parallel quicksort in CUDA. It comes in
// several parts:
//
// 1. A small-set insertion sort. We do this on any set with <=32 elements
// 2. A partitioning kernel, which - given a pivot - separates an input
// array into elements <=pivot, and >pivot. Two quicksorts will then
// be launched to resolve each of these.
// 3. A quicksort co-ordinator, which figures out what kernels to launch
// and when.
//
////////////////////////////////////////////////////////////////////////////////
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <helper_string.h>
#include "cdpQuicksort.h"
////////////////////////////////////////////////////////////////////////////////
// Inline PTX call to return index of highest non-zero bit in a word
////////////////////////////////////////////////////////////////////////////////
static __device__ __forceinline__ unsigned int __qsflo(unsigned int word)
{
unsigned int ret;
asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
return ret;
}
////////////////////////////////////////////////////////////////////////////////
//
// ringbufAlloc
//
// Allocates from a ringbuffer. Allows for not failing when we run out
// of stack for tracking the offset counts for each sort subsection.
//
// We use the atomicMax trick to allow out-of-order retirement. If we
// hit the size limit on the ringbuffer, then we spin-wait for people
// to complete.
//
////////////////////////////////////////////////////////////////////////////////
template< typename T >
static __device__ T *ringbufAlloc(qsortRingbuf *ringbuf)
{
// Wait for there to be space in the ring buffer. We'll retry only a fixed
// number of times and then fail, to avoid an out-of-memory deadlock.
unsigned int loop = 10000;
while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && (--loop > 0)); // Pre-decrement so the "loop == 0" test below reliably detects exhaustion
if (loop == 0)
return NULL;
// Note that the element includes a little index book-keeping, for freeing later.
unsigned int index = atomicAdd((unsigned int *) &ringbuf->head, 1);
T *ret = (T *)(ringbuf->stackbase) + (index & (ringbuf->stacksize-1));
ret->index = index;
return ret;
}
////////////////////////////////////////////////////////////////////////////////
//
// ringBufFree
//
// Releases an element from the ring buffer. If every element is released
// up to and including this one, we can advance the tail to indicate that
// space is now available.
//
////////////////////////////////////////////////////////////////////////////////
template< typename T >
static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data)
{
unsigned int index = data->index; // Non-wrapped index to free
unsigned int count = atomicAdd((unsigned int *)&(ringbuf->count), 1) + 1;
unsigned int max = atomicMax((unsigned int *)&(ringbuf->max), index + 1);
// Update the tail if need be. Note we update "max" to be the new value in ringbuf->max
if (max < (index+1)) max = index+1;
if (max == count)
atomicMax((unsigned int *)&(ringbuf->tail), count);
}
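// Host-side sketch of the retirement rule above (hypothetical names, single-threaded
// stand-in for the device atomics): "tail" may only advance once every allocated slot
// below "max" has been freed, i.e. when the number of frees equals the highest freed
// index + 1, so out-of-order frees never expose still-live slots.
struct RingbufSketch
{
    unsigned int head, tail, count, max;
};
static void ringbufFreeSketch(RingbufSketch &rb, unsigned int index)
{
    unsigned int count = ++rb.count;                          // total slots freed so far
    rb.max = (index + 1 > rb.max) ? (index + 1) : rb.max;     // highest freed index + 1
    if (rb.max == count)                                      // no holes below "max" remain
        rb.tail = (count > rb.tail) ? count : rb.tail;        // let producers reuse the space
}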
////////////////////////////////////////////////////////////////////////////////
//
// qsort_warp
//
// Simplest possible implementation, does a per-warp quicksort with no inter-warp
// communication. This has a high atomic issue rate, but the rest should actually
// be fairly quick because of low work per thread.
//
// A warp finds its section of the data, then writes all data <pivot to one
// buffer and all data >pivot to the other. Atomics are used to get a unique
// section of the buffer.
//
// Obvious optimisation: do multiple chunks per warp, to increase in-flight loads
// and cover the instruction overhead.
//
////////////////////////////////////////////////////////////////////////////////
__global__ void qsort_warp(unsigned *indata,
unsigned *outdata,
unsigned int offset,
unsigned int len,
qsortAtomicData *atomicData,
qsortRingbuf *atomicDataStack,
unsigned int source_is_indata,
unsigned int depth)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Find my data offset, based on warp ID
unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT);
//unsigned int warp_id = threadIdx.x >> 5; // Used for debug only
unsigned int lane_id = threadIdx.x & (warpSize-1);
// Exit if I'm outside the range of sort to be done
if (thread_id >= len)
return;
//
// First part of the algorithm. Each warp counts the number of elements that are
// greater/less than the pivot.
//
// When a warp knows its count, it updates an atomic counter.
//
// Read in the data and the pivot. Arbitrary pivot selection for now.
unsigned pivot = indata[offset + len/2];
unsigned data = indata[offset + thread_id];
// Count how many are <= and how many are > pivot.
// If all are <= pivot then we adjust the comparison
// because otherwise the sort will move nothing and
// we'll iterate forever.
cg::coalesced_group active = cg::coalesced_threads();
unsigned int greater = (data > pivot);
unsigned int gt_mask = active.ballot(greater);
if (gt_mask == 0)
{
greater = (data >= pivot);
gt_mask = active.ballot(greater); // Must re-ballot for adjusted comparator
}
unsigned int lt_mask = active.ballot(!greater);
unsigned int gt_count = __popc(gt_mask);
unsigned int lt_count = __popc(lt_mask);
// Atomically adjust the lt_ and gt_offsets by this amount. Only one thread need do this. Share the result using shfl
unsigned int lt_offset, gt_offset;
if (lane_id == 0)
{
if (lt_count > 0)
lt_offset = atomicAdd((unsigned int *) &atomicData->lt_offset, lt_count);
if (gt_count > 0)
gt_offset = len - (atomicAdd((unsigned int *) &atomicData->gt_offset, gt_count) + gt_count);
}
lt_offset = active.shfl((int)lt_offset, 0); // Everyone pulls the offsets from lane 0
gt_offset = active.shfl((int)gt_offset, 0);
// Now compute my own personal offset within this. I need to know how many
// threads with a lane ID less than mine are going to write to the same buffer
// as me. We can use popc to implement a single-operation warp scan in this case.
unsigned lane_mask_lt;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
unsigned int my_mask = greater ? gt_mask : lt_mask;
unsigned int my_offset = __popc(my_mask & lane_mask_lt);
// Move data.
my_offset += greater ? gt_offset : lt_offset;
outdata[offset + my_offset] = data;
// Count up if we're the last warp in. If so, then Kepler will launch the next
// set of sorts directly from here.
if (lane_id == 0)
{
// Count "elements written". If I wrote the last one, then trigger the next qsorts
unsigned int mycount = lt_count + gt_count;
if (atomicAdd((unsigned int *) &atomicData->sorted_count, mycount) + mycount == len)
{
// We're the last warp to do any sorting. Therefore it's up to us to launch the next stage.
unsigned int lt_len = atomicData->lt_offset;
unsigned int gt_len = atomicData->gt_offset;
cudaStream_t lstream, rstream;
cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking);
// Begin by freeing our atomicData storage. It's better for the ringbuffer algorithm
// if we free when we're done, rather than re-using (makes for less fragmentation).
ringbufFree<qsortAtomicData>(atomicDataStack, atomicData);
// Exceptional case: if "lt_len" is zero, then all values in the batch
// are equal. We are then done (may need to copy into correct buffer, though)
if (lt_len == 0)
{
if (source_is_indata)
cudaMemcpyAsync(indata+offset, outdata+offset, gt_len*sizeof(unsigned), cudaMemcpyDeviceToDevice, lstream);
return;
}
// Start with lower half first
if (lt_len > BITONICSORT_LEN)
{
// If we've exceeded maximum depth, fall through to backup big_bitonicsort
if (depth >= QSORT_MAXDEPTH)
{
// The final bitonic stage sorts in-place in "outdata". We therefore
// re-use "indata" as the out-of-range tracking buffer. For (2^n)+1
// elements we need (2^(n+1)) bytes of oor buffer. The backup qsort
// buffer is at least this large when sizeof(QTYPE) >= 2.
big_bitonicsort<<< 1, BITONICSORT_LEN, 0, lstream >>>(outdata, source_is_indata ? indata : outdata, indata, offset, lt_len);
}
else
{
// Launch another quicksort. We need to allocate more storage for the atomic data.
if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
printf("Stack-allocation error. Failing left child launch.\n");
else
{
atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
unsigned int numblocks = (unsigned int)(lt_len+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, lstream >>>(outdata, indata, offset, lt_len, atomicData, atomicDataStack, !source_is_indata, depth+1);
}
}
}
else if (lt_len > 1)
{
// Final stage uses a bitonic sort instead. It's important to
// make sure the final stage ends up in the correct (original) buffer.
// We launch the smallest power-of-2 number of threads that we can.
unsigned int bitonic_len = 1 << (__qsflo(lt_len-1U)+1);
bitonicsort<<< 1, bitonic_len, 0, lstream >>>(outdata, source_is_indata ? indata : outdata, offset, lt_len);
}
// Finally, if we sorted just one single element, we must still make
// sure that it winds up in the correct place.
else if (source_is_indata && (lt_len == 1))
indata[offset] = outdata[offset];
if (cudaPeekAtLastError() != cudaSuccess)
printf("Left-side launch fail: %s\n", cudaGetErrorString(cudaGetLastError()));
// Now the upper half.
if (gt_len > BITONICSORT_LEN)
{
// If we've exceeded maximum depth, fall through to backup big_bitonicsort
if (depth >= QSORT_MAXDEPTH)
big_bitonicsort<<< 1, BITONICSORT_LEN, 0, rstream >>>(outdata, source_is_indata ? indata : outdata, indata, offset+lt_len, gt_len);
else
{
// Allocate new atomic storage for this launch
if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) == NULL)
printf("Stack allocation error! Failing right-side launch.\n");
else
{
atomicData->lt_offset = atomicData->gt_offset = atomicData->sorted_count = 0;
unsigned int numblocks = (unsigned int)(gt_len+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, rstream >>>(outdata, indata, offset+lt_len, gt_len, atomicData, atomicDataStack, !source_is_indata, depth+1);
}
}
}
else if (gt_len > 1)
{
unsigned int bitonic_len = 1 << (__qsflo(gt_len-1U)+1);
bitonicsort<<< 1, bitonic_len, 0, rstream >>>(outdata, source_is_indata ? indata : outdata, offset+lt_len, gt_len);
}
else if (source_is_indata && (gt_len == 1))
indata[offset+lt_len] = outdata[offset+lt_len];
if (cudaPeekAtLastError() != cudaSuccess)
printf("Right-side launch fail: %s\n", cudaGetErrorString(cudaGetLastError()));
}
}
}
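// Standalone sketch of the warp-level partition step used in qsort_warp (hypothetical
// kernel, not launched anywhere): ballot the "greater than pivot" predicate, then use
// popc with a lane mask to compute each lane's rank within its side. Assumes a single,
// fully active warp of 32 threads.
__global__ void warpPartitionSketch(const unsigned *in, unsigned *out, unsigned pivot)
{
    unsigned int lane = threadIdx.x & 31;
    unsigned val = in[lane];
    unsigned int greater = (val > pivot);
    unsigned int gt_mask = __ballot_sync(0xffffffffu, greater);   // lanes with val > pivot
    unsigned int lt_mask = ~gt_mask;                              // remaining lanes (warp fully active)
    unsigned int lane_mask_lt = (1u << lane) - 1;                 // mask of lanes below mine
    unsigned int lt_count = __popc(lt_mask);                      // size of the "<= pivot" side
    unsigned int my_mask = greater ? gt_mask : lt_mask;
    unsigned int my_rank = __popc(my_mask & lane_mask_lt);        // my position within my side
    // "<= pivot" elements pack to the front, "> pivot" elements follow them
    out[greater ? (lt_count + my_rank) : my_rank] = val;
}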
////////////////////////////////////////////////////////////////////////////////
//
// run_quicksort
//
// Host-side code to run the Kepler version of quicksort. It's pretty
// simple, because all launch control is handled on the device via CDP.
//
// All parallel quicksorts require an equal-sized scratch buffer. This
// must be passed in ahead of time.
//
// Returns the time elapsed for the sort.
//
////////////////////////////////////////////////////////////////////////////////
float run_quicksort_cdp(unsigned *gpudata, unsigned *scratchdata, unsigned int count, cudaStream_t stream)
{
unsigned int stacksize = QSORT_STACK_ELEMS;
// This is the stack, for atomic tracking of each sort's status
qsortAtomicData *gpustack;
checkCudaErrors(cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData)));
checkCudaErrors(cudaMemset(gpustack, 0, sizeof(qsortAtomicData))); // Only need to set the first entry to 0
// Create the memory ringbuffer used for handling the stack.
// Initialise everything to where it needs to be.
qsortRingbuf buf;
qsortRingbuf *ringbuf;
checkCudaErrors(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf)));
buf.head = 1; // We start with one allocation
buf.tail = 0;
buf.count = 0;
buf.max = 0;
buf.stacksize = stacksize;
buf.stackbase = gpustack;
checkCudaErrors(cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice));
// Timing events...
cudaEvent_t ev1, ev2;
checkCudaErrors(cudaEventCreate(&ev1));
checkCudaErrors(cudaEventCreate(&ev2));
checkCudaErrors(cudaEventRecord(ev1));
// Now we trivially launch the qsort kernel
if (count > BITONICSORT_LEN)
{
unsigned int numblocks = (unsigned int)(count+(QSORT_BLOCKSIZE-1)) >> QSORT_BLOCKSIZE_SHIFT;
qsort_warp<<< numblocks, QSORT_BLOCKSIZE, 0, stream >>>(gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0);
}
else
{
bitonicsort<<< 1, BITONICSORT_LEN >>>(gpudata, gpudata, 0, count);
}
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaEventRecord(ev2));
checkCudaErrors(cudaDeviceSynchronize());
float elapse=0.0f;
if (cudaPeekAtLastError() != cudaSuccess)
printf("Launch failure: %s\n", cudaGetErrorString(cudaGetLastError()));
else
checkCudaErrors(cudaEventElapsedTime(&elapse, ev1, ev2));
// Sanity check that the stack allocator is doing the right thing
checkCudaErrors(cudaMemcpy(&buf, ringbuf, sizeof(*ringbuf), cudaMemcpyDeviceToHost));
if (count > BITONICSORT_LEN && buf.head != buf.tail)
{
printf("Stack allocation error!\nRingbuf:\n");
printf("\t head = %u\n", buf.head);
printf("\t tail = %u\n", buf.tail);
printf("\tcount = %u\n", buf.count);
printf("\t max = %u\n", buf.max);
}
// Release our stack data once we're done
checkCudaErrors(cudaFree(ringbuf));
checkCudaErrors(cudaFree(gpustack));
return elapse;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
int run_qsort(unsigned int size, int seed, int debug, int loop, int verbose)
{
if (seed > 0)
srand(seed);
// Create and set up our test
unsigned *gpudata, *scratchdata;
checkCudaErrors(cudaMalloc((void **)&gpudata, size*sizeof(unsigned)));
checkCudaErrors(cudaMalloc((void **)&scratchdata, size*sizeof(unsigned)));
// Create CPU data.
unsigned *data = new unsigned[size];
unsigned int min = loop ? loop : size;
unsigned int max = size;
loop = (loop == 0) ? 1 : loop;
for (size=min; size<=max; size+=loop)
{
if (verbose)
printf(" Input: ");
for (unsigned int i=0; i<size; i++)
{
// Build data 8 bits at a time
data[i] = 0;
char *ptr = (char *)&(data[i]);
for (unsigned j=0; j<sizeof(unsigned); j++)
{
// Easy-to-read data in debug mode
if (debug)
{
*ptr++ = (char)(rand() % 10);
break;
}
*ptr++ = (char)(rand() & 255);
}
if (verbose)
{
if (i && !(i%32))
printf("\n ");
printf("%u ", data[i]);
}
}
if (verbose)
printf("\n");
checkCudaErrors(cudaMemcpy(gpudata, data, size*sizeof(unsigned), cudaMemcpyHostToDevice));
// So we're now populated and ready to go! We size our launch as
// blocks of up to BLOCKSIZE threads, and appropriate grid size.
// One thread is launched per element.
float elapse;
elapse = run_quicksort_cdp(gpudata, scratchdata, size, NULL);
//run_bitonicsort<SORTTYPE>(gpudata, scratchdata, size, verbose);
checkCudaErrors(cudaDeviceSynchronize());
// Copy back the data and verify correct sort
checkCudaErrors(cudaMemcpy(data, gpudata, size*sizeof(unsigned), cudaMemcpyDeviceToHost));
if (verbose)
{
printf("Output: ");
for (unsigned int i=0; i<size; i++)
{
if (i && !(i%32)) printf("\n ");
printf("%u ", data[i]);
}
printf("\n");
}
unsigned int check;
for (check=1; check<size; check++)
{
if (data[check] < data[check-1])
{
printf("FAILED at element: %d\n", check);
break;
}
}
if (check != size)
{
printf(" cdpAdvancedQuicksort FAILED\n");
exit(EXIT_FAILURE);
}
else
printf(" cdpAdvancedQuicksort PASSED\n");
// Display the time between event recordings
printf("Sorted %u elems in %.3f ms (%.3f Melems/sec)\n", size, elapse, (float)size/(elapse*1000.0f));
fflush(stdout);
}
// Release everything and we're done
checkCudaErrors(cudaFree(scratchdata));
checkCudaErrors(cudaFree(gpudata));
delete [] data; // data was allocated with new[]
return 0;
}
static void usage()
{
printf("Syntax: qsort [-size=<num>] [-seed=<num>] [-debug] [-loop-step=<num>] [-verbose]\n");
printf("If loop_step is non-zero, will run from 1->array_len in steps of loop_step\n");
}
// Host side entry
int main(int argc, char *argv[])
{
int size = 5000; // TODO: make this 1e6
unsigned int seed = 100; // TODO: make this 0
int debug = 0;
int loop = 0;
int verbose = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
checkCmdLineFlag(argc, (const char **)argv, "h"))
{
usage();
printf("&&&& cdpAdvancedQuicksort WAIVED\n");
exit(EXIT_WAIVED);
}
if (checkCmdLineFlag(argc, (const char **)argv, "size"))
{
size = getCmdLineArgumentInt(argc, (const char **)argv, "size");
}
if (checkCmdLineFlag(argc, (const char **)argv, "seed"))
{
seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed");
}
if (checkCmdLineFlag(argc, (const char **)argv, "loop-step"))
{
loop = getCmdLineArgumentInt(argc, (const char **)argv, "loop-step");
}
if (checkCmdLineFlag(argc, (const char **)argv, "debug"))
{
debug = 1;
}
if (checkCmdLineFlag(argc, (const char **)argv, "verbose"))
{
verbose = 1;
}
// Get device properties
int cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp properties;
checkCudaErrors(cudaGetDeviceProperties(&properties, cuda_device));
int cdpCapable = (properties.major == 3 && properties.minor >= 5) || properties.major >=4;
printf("GPU device %s has compute capabilities (SM %d.%d)\n", properties.name, properties.major, properties.minor);
if (!cdpCapable)
{
printf("cdpAdvancedQuicksort requires SM 3.5 or higher to use CUDA Dynamic Parallelism. Exiting...\n");
exit(EXIT_WAIVED);
}
printf("Running qsort on %d elements with seed %d, on %s\n", size, seed, properties.name);
run_qsort(size, seed, debug, loop, verbose);
exit(EXIT_SUCCESS);
}
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <helper_cuda.h>
#include <string.h>
__forceinline__ __device__ float2 operator+(float2 a, float2 b)
{
float2 c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}
__forceinline__ __device__ float2 operator-(float2 a, float2 b)
{
float2 c;
c.x = a.x - b.x;
c.y = a.y - b.y;
return c;
}
__forceinline__ __device__ float2 operator*(float a, float2 b)
{
float2 c;
c.x = a * b.x;
c.y = a * b.y;
return c;
}
__forceinline__ __device__ float length(float2 a)
{
return sqrtf(a.x*a.x + a.y*a.y);
}
#define MAX_TESSELLATION 32
struct BezierLine
{
float2 CP[3];
float2 *vertexPos;
int nVertices;
};
__global__ void computeBezierLinePositions(int lidx, BezierLine *bLines, int nTessPoints)
{
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx < nTessPoints)
{
float u = (float)idx/(float)(nTessPoints-1);
float omu = 1.0f - u;
float B3u[3];
B3u[0] = omu*omu;
B3u[1] = 2.0f*u*omu;
B3u[2] = u*u;
float2 position = {0,0};
for (int i = 0; i < 3; i++)
{
position = position + B3u[i] * bLines[lidx].CP[i];
}
bLines[lidx].vertexPos[idx] = position;
}
}
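// Host-side reference for one tessellation point (hypothetical helper for
// cross-checking only), using the same quadratic Bernstein weights as the
// kernel above.
static float2 evalQuadraticBezierHost(const float2 cp[3], float u)
{
    float omu = 1.0f - u;
    float b0 = omu * omu;        // B0(u) = (1-u)^2
    float b1 = 2.0f * u * omu;   // B1(u) = 2u(1-u)
    float b2 = u * u;            // B2(u) = u^2
    float2 p;
    p.x = b0 * cp[0].x + b1 * cp[1].x + b2 * cp[2].x;
    p.y = b0 * cp[0].y + b1 * cp[1].y + b2 * cp[2].y;
    return p;
}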
__global__ void computeBezierLinesCDP(BezierLine *bLines, int nLines)
{
int lidx = threadIdx.x + blockDim.x*blockIdx.x;
if (lidx < nLines)
{
float curvature = length(bLines[lidx].CP[1] - 0.5f*(bLines[lidx].CP[0] + bLines[lidx].CP[2]))/length(bLines[lidx].CP[2] - bLines[lidx].CP[0]);
int nTessPoints = min(max((int)(curvature*16.0f),4),MAX_TESSELLATION);
if (bLines[lidx].vertexPos == NULL)
{
bLines[lidx].nVertices = nTessPoints;
cudaMalloc((void **)&bLines[lidx].vertexPos, nTessPoints*sizeof(float2));
}
computeBezierLinePositions<<<ceil((float)bLines[lidx].nVertices/32.0f), 32>>>(lidx, bLines, bLines[lidx].nVertices);
}
}
__global__ void freeVertexMem(BezierLine *bLines, int nLines)
{
int lidx = threadIdx.x + blockDim.x*blockIdx.x;
if (lidx < nLines)
cudaFree(bLines[lidx].vertexPos);
}
unsigned int checkCapableSM35Device(int argc, char** argv)
{
// Get device properties
cudaDeviceProp properties;
int device_count = 0, device = -1;
if(checkCmdLineFlag(argc, (const char **)argv, "device"))
{
device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
cudaDeviceProp properties;
checkCudaErrors(cudaGetDeviceProperties(&properties, device));
if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
{
printf("Running on GPU %d (%s)\n", device , properties.name);
}
else
{
printf("cdpBezierTessellation requires GPU devices with compute SM 3.5 or higher.");
printf("Current GPU device has compute SM %d.%d. Exiting...\n",properties.major, properties.minor);
return EXIT_FAILURE;
}
}
else
{
checkCudaErrors(cudaGetDeviceCount(&device_count));
for (int i=0; i < device_count; ++i)
{
checkCudaErrors(cudaGetDeviceProperties(&properties, i));
if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
{
device = i;
printf("Running on GPU %d (%s)\n", i, properties.name);
break;
}
printf("GPU %d %s does not support CUDA Dynamic Parallelism\n", i, properties.name);
}
}
if (device == -1)
{
fprintf(stderr, "cdpBezierTessellation requires GPU devices with compute SM 3.5 or higher. Exiting...\n");
return EXIT_WAIVED;
}
return EXIT_SUCCESS;
}
#define N_LINES 256
#define BLOCK_DIM 64
int main(int argc, char **argv)
{
BezierLine *bLines_h = new BezierLine[N_LINES];
float2 last = {0,0};
for (int i = 0; i < N_LINES; i++)
{
bLines_h[i].CP[0] = last;
for (int j = 1; j < 3; j++)
{
bLines_h[i].CP[j].x = (float)rand()/(float)RAND_MAX;
bLines_h[i].CP[j].y = (float)rand()/(float)RAND_MAX;
}
last = bLines_h[i].CP[2];
bLines_h[i].vertexPos = NULL;
bLines_h[i].nVertices = 0;
}
unsigned int sm35Ret = checkCapableSM35Device(argc, argv);
if (sm35Ret != EXIT_SUCCESS)
{
exit(sm35Ret);
}
BezierLine *bLines_d;
checkCudaErrors(cudaMalloc((void **)&bLines_d, N_LINES*sizeof(BezierLine)));
checkCudaErrors(cudaMemcpy(bLines_d, bLines_h, N_LINES*sizeof(BezierLine), cudaMemcpyHostToDevice));
printf("Computing Bezier Lines (CUDA Dynamic Parallelism Version) ... ");
computeBezierLinesCDP<<< (unsigned int)ceil((float)N_LINES/(float)BLOCK_DIM), BLOCK_DIM >>>(bLines_d, N_LINES);
printf("Done!\n");
//Do something to draw the lines here
freeVertexMem<<< (unsigned int)ceil((float)N_LINES/(float)BLOCK_DIM), BLOCK_DIM >>>(bLines_d, N_LINES);
checkCudaErrors(cudaFree(bLines_d));
delete[] bLines_h;
exit(EXIT_SUCCESS);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <assert.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "scan_common.h"
//All three kernels run THREADBLOCK_SIZE (currently 256) threads per workgroup
//Must be a power of two
#define THREADBLOCK_SIZE 256
////////////////////////////////////////////////////////////////////////////////
// Basic scan codelets
////////////////////////////////////////////////////////////////////////////////
//Naive inclusive scan: O(N * log2(N)) operations
//Allocate 2 * 'size' local memory, initialize the first half
//with 'size' zeros avoiding if(pos >= offset) condition evaluation
//and saving instructions
inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size, cg::thread_block cta)
{
uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
s_Data[pos] = 0;
pos += size;
s_Data[pos] = idata;
for (uint offset = 1; offset < size; offset <<= 1)
{
cg::sync(cta);
uint t = s_Data[pos] + s_Data[pos - offset];
cg::sync(cta);
s_Data[pos] = t;
}
return s_Data[pos];
}
inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size, cg::thread_block cta)
{
return scan1Inclusive(idata, s_Data, size, cta) - idata;
}
inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size, cg::thread_block cta)
{
//Level-0 inclusive scan
idata4.y += idata4.x;
idata4.z += idata4.y;
idata4.w += idata4.z;
//Level-1 exclusive scan
uint oval = scan1Exclusive(idata4.w, s_Data, size / 4, cta);
idata4.x += oval;
idata4.y += oval;
idata4.z += oval;
idata4.w += oval;
return idata4;
}
//Exclusive vector scan: the array to be scanned is stored
//in local thread memory scope as uint4
inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size, cg::thread_block cta)
{
uint4 odata4 = scan4Inclusive(idata4, s_Data, size, cta);
odata4.x -= idata4.x;
odata4.y -= idata4.y;
odata4.z -= idata4.z;
odata4.w -= idata4.w;
return odata4;
}
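//Host-side reference exclusive scan (hypothetical helper for validation only):
//out[i] holds the sum of all in[j] with j < i, so out[0] == 0, matching what the
//device kernels below compute.
static void exclusiveScanHost(const uint *in, uint *out, uint n)
{
    uint running = 0;
    for (uint i = 0; i < n; i++)
    {
        out[i] = running;     // prefix sum excluding the current element
        running += in[i];
    }
}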
////////////////////////////////////////////////////////////////////////////////
// Scan kernels
////////////////////////////////////////////////////////////////////////////////
__global__ void scanExclusiveShared(
uint4 *d_Dst,
uint4 *d_Src,
uint size
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_Data[2 * THREADBLOCK_SIZE];
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
//Load data
uint4 idata4 = d_Src[pos];
//Calculate exclusive scan
uint4 odata4 = scan4Exclusive(idata4, s_Data, size, cta);
//Write back
d_Dst[pos] = odata4;
}
//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
__global__ void scanExclusiveShared2(
uint *d_Buf,
uint *d_Dst,
uint *d_Src,
uint N,
uint arrayLength
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_Data[2 * THREADBLOCK_SIZE];
//Skip loads and stores for inactive threads of last threadblock (pos >= N)
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
//Load top elements
//Convert results of bottom-level scan back to inclusive
uint idata = 0;
if (pos < N)
idata =
d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos] +
d_Src[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos];
//Compute
uint odata = scan1Exclusive(idata, s_Data, arrayLength, cta);
//Avoid out-of-bound access
if (pos < N)
{
d_Buf[pos] = odata;
}
}
//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
__global__ void uniformUpdate(
uint4 *d_Data,
uint *d_Buffer
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint buf;
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIdx.x == 0)
{
buf = d_Buffer[blockIdx.x];
}
cg::sync(cta);
uint4 data4 = d_Data[pos];
data4.x += buf;
data4.y += buf;
data4.z += buf;
data4.w += buf;
d_Data[pos] = data4;
}
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
//Derived as 65536 * 4 * THREADBLOCK_SIZE (= 64 * 1048576 elements)
//Due to scanExclusiveShared<<<>>>() 1D block addressing
extern "C" const uint MAX_BATCH_ELEMENTS = 64 * 1048576;
extern "C" const uint MIN_SHORT_ARRAY_SIZE = 4;
extern "C" const uint MAX_SHORT_ARRAY_SIZE = 4 * THREADBLOCK_SIZE;
extern "C" const uint MIN_LARGE_ARRAY_SIZE = 8 * THREADBLOCK_SIZE;
extern "C" const uint MAX_LARGE_ARRAY_SIZE = 4 * THREADBLOCK_SIZE * THREADBLOCK_SIZE;
//Internal exclusive scan buffer
static uint *d_Buf;
extern "C" void initScan(void)
{
checkCudaErrors(cudaMalloc((void **)&d_Buf, (MAX_BATCH_ELEMENTS / (4 * THREADBLOCK_SIZE)) * sizeof(uint)));
}
extern "C" void closeScan(void)
{
checkCudaErrors(cudaFree(d_Buf));
}
static uint factorRadix2(uint &log2L, uint L)
{
if (!L)
{
log2L = 0;
return 0;
}
else
{
for (log2L = 0; (L & 1) == 0; L >>= 1, log2L++);
return L;
}
}
static uint iDivUp(uint dividend, uint divisor)
{
return ((dividend % divisor) == 0) ? (dividend / divisor) : (dividend / divisor + 1);
}
extern "C" size_t scanExclusiveShort(
uint *d_Dst,
uint *d_Src,
uint batchSize,
uint arrayLength
)
{
//Check power-of-two factorization
uint log2L;
uint factorizationRemainder = factorRadix2(log2L, arrayLength);
assert(factorizationRemainder == 1);
//Check supported size range
assert((arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE));
//Check total batch size limit
assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS);
//Check all threadblocks to be fully packed with data
assert((batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0);
scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
(uint4 *)d_Dst,
(uint4 *)d_Src,
arrayLength
);
getLastCudaError("scanExclusiveShared() execution FAILED\n");
return THREADBLOCK_SIZE;
}
extern "C" size_t scanExclusiveLarge(
uint *d_Dst,
uint *d_Src,
uint batchSize,
uint arrayLength
)
{
//Check power-of-two factorization
uint log2L;
uint factorizationRemainder = factorRadix2(log2L, arrayLength);
assert(factorizationRemainder == 1);
//Check supported size range
assert((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE));
//Check total batch size limit
assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS);
scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
(uint4 *)d_Dst,
(uint4 *)d_Src,
4 * THREADBLOCK_SIZE
);
getLastCudaError("scanExclusiveShared() execution FAILED\n");
//Not all threadblocks need to be packed with input data:
//inactive threads of highest threadblock just don't do global reads and writes
const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE);
scanExclusiveShared2<<< blockCount2, THREADBLOCK_SIZE>>>(
(uint *)d_Buf,
(uint *)d_Dst,
(uint *)d_Src,
(batchSize *arrayLength) / (4 * THREADBLOCK_SIZE),
arrayLength / (4 * THREADBLOCK_SIZE)
);
getLastCudaError("scanExclusiveShared2() execution FAILED\n");
uniformUpdate<<<(batchSize *arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
(uint4 *)d_Dst,
(uint *)d_Buf
);
getLastCudaError("uniformUpdate() execution FAILED\n");
return THREADBLOCK_SIZE;
}
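//Typical host-side call sequence for the large-array path above, shown as a sketch
//only (device allocation and data transfer omitted; d_Dst and d_Src are assumed to
//be device pointers holding batchSize * arrayLength elements):
//
//    initScan();   // allocates the internal d_Buf
//    scanExclusiveLarge(d_Dst, d_Src, 1 /*batchSize*/, 8 * THREADBLOCK_SIZE /*arrayLength*/);
//    closeScan();  // frees the internal d_Buf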
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Example of program using the interval_gpu<T> template class and operators:
* Search for roots of a function using an interval Newton method.
*
* Use the command-line argument "--n=<N>" to select which GPU implementation to use,
* otherwise the naive implementation will be used by default.
* 0: the naive implementation
* 1: the optimized implementation
* 2: the recursive implementation
*
*/
const static char *sSDKsample = "Interval Computing";
#include <iostream>
#include <stdio.h>
#include "helper_cuda.h"
#include "interval.h"
#include "cuda_interval.h"
#include "cpu_interval.h"
int main(int argc,char *argv[])
{
int implementation_choice = 0;
printf("[%s] starting ...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **) argv, "n"))
{
implementation_choice = getCmdLineArgumentInt(argc, (const char **) argv, "n");
}
// Pick the best GPU available, or if the developer selects one at the command line
int devID = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, devID);
printf("> GPU Device has Compute Capabilities SM %d.%d\n\n", deviceProp.major, deviceProp.minor);
switch (implementation_choice)
{
case 0:
printf("GPU naive implementation\n");
break;
case 1:
printf("GPU optimized implementation\n");
break;
case 2:
printf("GPU recursive implementation (requires Compute SM 2.0+)\n");
break;
default:
printf("GPU naive implementation\n");
}
interval_gpu<T> *d_result;
int *d_nresults;
int *h_nresults = new int[THREADS];
cudaEvent_t start, stop;
CHECKED_CALL(cudaSetDevice(devID));
CHECKED_CALL(cudaMalloc((void **)&d_result, THREADS * DEPTH_RESULT * sizeof(*d_result)));
CHECKED_CALL(cudaMalloc((void **)&d_nresults, THREADS * sizeof(*d_nresults)));
CHECKED_CALL(cudaEventCreate(&start));
CHECKED_CALL(cudaEventCreate(&stop));
// We need L1 cache to store the stack (only applicable to sm_20 and higher)
CHECKED_CALL(cudaFuncSetCacheConfig(test_interval_newton<T>, cudaFuncCachePreferL1));
// Increase the stack size large enough for the non-inlined and recursive function calls (only applicable to sm_20 and higher)
CHECKED_CALL(cudaDeviceSetLimit(cudaLimitStackSize, 8192));
interval_gpu<T> i(0.01f, 4.0f);
std::cout << "Searching for roots in [" << i.lower() << ", " << i.upper() << "]...\n";
CHECKED_CALL(cudaEventRecord(start, 0));
for (int it = 0; it < NUM_RUNS; ++it)
{
test_interval_newton<T><<<GRID_SIZE, BLOCK_SIZE>>>(d_result, d_nresults, i, implementation_choice);
CHECKED_CALL(cudaGetLastError());
}
CHECKED_CALL(cudaEventRecord(stop, 0));
CHECKED_CALL(cudaDeviceSynchronize());
I_CPU *h_result = new I_CPU[THREADS * DEPTH_RESULT];
CHECKED_CALL(cudaMemcpy(h_result, d_result, THREADS * DEPTH_RESULT * sizeof(*d_result), cudaMemcpyDeviceToHost));
CHECKED_CALL(cudaMemcpy(h_nresults, d_nresults, THREADS * sizeof(*d_nresults), cudaMemcpyDeviceToHost));
std::cout << "Found " << h_nresults[0] << " intervals that may contain the root(s)\n";
std::cout.precision(15);
for (int i = 0; i != h_nresults[0]; ++i)
{
std::cout << " i[" << i << "] ="
<< " [" << h_result[THREADS * i + 0].lower()
<< ", " << h_result[THREADS * i + 0].upper() << "]\n";
}
float time;
CHECKED_CALL(cudaEventElapsedTime(&time, start, stop));
std::cout << "Number of equations solved: " << THREADS << "\n";
std::cout << "Time per equation: " << 1000000.0f * (time / (float)(THREADS)) / NUM_RUNS << " us\n";
CHECKED_CALL(cudaEventDestroy(start));
CHECKED_CALL(cudaEventDestroy(stop));
CHECKED_CALL(cudaFree(d_result));
CHECKED_CALL(cudaFree(d_nresults));
// Compute the results using a CPU implementation based on the Boost library
I_CPU i_cpu(0.01f, 4.0f);
I_CPU *h_result_cpu = new I_CPU[THREADS * DEPTH_RESULT];
int *h_nresults_cpu = new int[THREADS];
test_interval_newton_cpu<I_CPU>(h_result_cpu, h_nresults_cpu, i_cpu);
// Compare the CPU and GPU results
bool bTestResult = checkAgainstHost(h_nresults, h_nresults_cpu, h_result, h_result_cpu);
delete [] h_result_cpu;
delete [] h_nresults_cpu;
delete [] h_result;
delete [] h_nresults;
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// std::system includes
#include <cstdio>
// CUDA-C includes
#include <cuda_runtime.h>
#include <helper_cuda.h>
#define TOTAL_SIZE 256*1024*1024
#define EACH_SIZE 128*1024*1024
// # threadblocks
#define TBLOCKS 1024
#define THREADS 512
// throw error on equality
#define ERR_EQ(X,Y) do { if ((X) == (Y)) { \
fprintf(stderr,"Error in %s at %s:%d\n",__func__,__FILE__,__LINE__); \
exit(-1);}} while(0)
// throw error on difference
#define ERR_NE(X,Y) do { if ((X) != (Y)) { \
fprintf(stderr,"Error in %s at %s:%d\n",__func__,__FILE__,__LINE__); \
exit(-1);}} while(0)
// copy from source -> destination arrays
__global__ void memcpy_kernel(int *dst, int *src, size_t n)
{
int num = gridDim.x * blockDim.x;
int id = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = id; i < n / sizeof(int); i += num)
{
dst[i] = src[i];
}
}
// initialise memory
void mem_init(int *buf, size_t n)
{
for (int i = 0; i < n / sizeof(int); i++)
{
buf[i] = i;
}
}
int main(int argc, char **argv)
{
cudaDeviceProp device_prop;
int dev_id;
printf("Starting [%s]...\n", argv[0]);
// set device
dev_id = findCudaDevice(argc, (const char **) argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
if ((device_prop.major << 4) + device_prop.minor < 0x35)
{
fprintf(stderr, "%s requires Compute Capability of SM 3.5 or higher to run.\nexiting...\n", argv[0]);
exit(EXIT_WAIVED);
}
// get the range of priorities available
// [ greatest_priority, lowest_priority ]
int priority_low;
int priority_hi;
checkCudaErrors(cudaDeviceGetStreamPriorityRange(&priority_low, &priority_hi));
printf("CUDA stream priority range: LOW: %d to HIGH: %d\n",priority_low,priority_hi);
// create streams with highest and lowest available priorities
cudaStream_t st_low;
cudaStream_t st_hi;
checkCudaErrors(cudaStreamCreateWithPriority(&st_low, cudaStreamNonBlocking, priority_low));
checkCudaErrors(cudaStreamCreateWithPriority(&st_hi, cudaStreamNonBlocking, priority_hi));
size_t size;
size = TOTAL_SIZE;
// initialise host data
int *h_src_low;
int *h_src_hi;
ERR_EQ(h_src_low = (int *) malloc(size), NULL);
ERR_EQ(h_src_hi = (int *) malloc(size), NULL);
mem_init(h_src_low, size);
mem_init(h_src_hi, size);
// initialise device data
int *h_dst_low;
int *h_dst_hi;
ERR_EQ(h_dst_low = (int *) malloc(size), NULL);
ERR_EQ(h_dst_hi = (int *) malloc(size), NULL);
memset(h_dst_low, 0, size);
memset(h_dst_hi, 0, size);
// copy source data -> device
int *d_src_low;
int *d_src_hi;
checkCudaErrors(cudaMalloc(&d_src_low, size));
checkCudaErrors(cudaMalloc(&d_src_hi, size));
checkCudaErrors(cudaMemcpy(d_src_low, h_src_low, size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_src_hi, h_src_hi, size, cudaMemcpyHostToDevice));
// allocate memory for memcopy destination
int *d_dst_low;
int *d_dst_hi;
checkCudaErrors(cudaMalloc(&d_dst_low, size));
checkCudaErrors(cudaMalloc(&d_dst_hi, size));
// create some events
cudaEvent_t ev_start_low;
cudaEvent_t ev_start_hi;
cudaEvent_t ev_end_low;
cudaEvent_t ev_end_hi;
checkCudaErrors(cudaEventCreate(&ev_start_low));
checkCudaErrors(cudaEventCreate(&ev_start_hi));
checkCudaErrors(cudaEventCreate(&ev_end_low));
checkCudaErrors(cudaEventCreate(&ev_end_hi));
/* */
// call pair of kernels repeatedly (with different priority streams)
checkCudaErrors(cudaEventRecord(ev_start_low, st_low));
checkCudaErrors(cudaEventRecord(ev_start_hi, st_hi));
for (int i = 0; i < TOTAL_SIZE; i += EACH_SIZE)
{
int j = i / sizeof(int);
memcpy_kernel<<<TBLOCKS, THREADS, 0, st_low>>>(d_dst_low + j, d_src_low + j, EACH_SIZE);
memcpy_kernel<<<TBLOCKS, THREADS, 0, st_hi >>>(d_dst_hi + j, d_src_hi + j, EACH_SIZE);
}
checkCudaErrors(cudaEventRecord(ev_end_low, st_low));
checkCudaErrors(cudaEventRecord(ev_end_hi, st_hi));
checkCudaErrors(cudaEventSynchronize(ev_end_low));
checkCudaErrors(cudaEventSynchronize(ev_end_hi));
/* */
size = TOTAL_SIZE;
checkCudaErrors(cudaMemcpy(h_dst_low, d_dst_low, size, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_dst_hi, d_dst_hi, size, cudaMemcpyDeviceToHost));
// check results of kernels
ERR_NE(memcmp(h_dst_low, h_src_low, size), 0);
ERR_NE(memcmp(h_dst_hi, h_src_hi, size), 0);
// check timings
float ms_low;
float ms_hi;
checkCudaErrors(cudaEventElapsedTime(&ms_low, ev_start_low, ev_end_low));
checkCudaErrors(cudaEventElapsedTime(&ms_hi, ev_start_hi, ev_end_hi));
printf("elapsed time of kernels launched to LOW priority stream: %.3lf ms\n", ms_low);
printf("elapsed time of kernels launched to HI priority stream: %.3lf ms\n", ms_hi);
exit(EXIT_SUCCESS);
}
/*
* Simple kernel for ptxjit demonstration.
*
*/
extern "C" __global__ void myKernel(int *data)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
data[tid] = tid;
}
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample implements a conjugate gradient solver on multiple GPU using
* Multi Device Cooperative Groups, also uses Unified Memory optimized using
* prefetching and usage hints.
*
*/
// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <set>
#include <cuda_runtime.h>
// Utilities and system includes
#include <helper_cuda.h> // helper function CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
const char *sSDKname = "conjugateGradientMultiDeviceCG";
#define ENABLE_CPU_DEBUG_CODE 0
#define THREADS_PER_BLOCK 512
__device__ double grid_dot_result = 0.0;
/* genTridiag: generate a random tridiagonal symmetric matrix */
void genTridiag(int *I, int *J, float *val, int N, int nz) {
I[0] = 0, J[0] = 0, J[1] = 1;
val[0] = (float)rand() / RAND_MAX + 10.0f;
val[1] = (float)rand() / RAND_MAX;
int start;
for (int i = 1; i < N; i++) {
if (i > 1) {
I[i] = I[i - 1] + 3;
} else {
I[1] = 2;
}
start = (i - 1) * 3 + 2;
J[start] = i - 1;
J[start + 1] = i;
if (i < N - 1) {
J[start + 2] = i + 1;
}
val[start] = val[start - 1];
val[start + 1] = (float)rand() / RAND_MAX + 10.0f;
if (i < N - 1) {
val[start + 2] = (float)rand() / RAND_MAX;
}
}
I[N] = nz;
}
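// Worked illustration of the CSR layout genTridiag produces for a tiny case,
// N = 4, nz = (N - 2) * 3 + 4 = 10 (entries shown by position only):
//   row 0: columns {0, 1}      (2 entries)
//   row 1: columns {0, 1, 2}   (3 entries)
//   row 2: columns {1, 2, 3}   (3 entries)
//   row 3: columns {2, 3}      (2 entries)
// so the row pointers are I = {0, 2, 5, 8, 10} and J lists the columns above in
// order; val[start] = val[start - 1] mirrors each off-diagonal entry so the matrix
// stays symmetric.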
// I - contains location of the given non-zero element in the row of the matrix
// J - contains location of the given non-zero element in the column of the matrix
// val - contains values of the given non-zero elements of the matrix
// inputVecX - input vector to be multiplied
// outputVecY - resultant vector
void cpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, float alpha,
float *inputVecX, float *outputVecY) {
for (int i = 0; i < num_rows; i++) {
int num_elems_this_row = I[i + 1] - I[i];
float output = 0.0;
for (int j = 0; j < num_elems_this_row; j++) {
output += alpha * val[I[i] + j] * inputVecX[J[I[i] + j]];
}
outputVecY[i] = output;
}
return;
}
double dotProduct(float *vecA, float *vecB, int size) {
double result = 0.0;
for (int i = 0; i < size; i++) {
result = result + (vecA[i] * vecB[i]);
}
return result;
}
void scaleVector(float *vec, float alpha, int size) {
for (int i = 0; i < size; i++) {
vec[i] = alpha * vec[i];
}
}
void saxpy(float *x, float *y, float a, int size) {
for (int i = 0; i < size; i++) {
y[i] = a * x[i] + y[i];
}
}
void cpuConjugateGrad(int *I, int *J, float *val, float *x, float *Ax, float *p,
float *r, int nnz, int N, float tol) {
int max_iter = 10000;
float alpha = 1.0;
float alpham1 = -1.0;
float r0 = 0.0, b, a, na;
cpuSpMV(I, J, val, nnz, N, alpha, x, Ax);
saxpy(Ax, r, alpham1, N);
float r1 = dotProduct(r, r, N);
int k = 1;
while (r1 > tol * tol && k <= max_iter) {
if (k > 1) {
b = r1 / r0;
scaleVector(p, b, N);
saxpy(r, p, alpha, N);
} else {
for (int i = 0; i < N; i++) p[i] = r[i];
}
cpuSpMV(I, J, val, nnz, N, alpha, p, Ax);
float dot = dotProduct(p, Ax, N);
a = r1 / dot;
saxpy(p, x, a, N);
na = -a;
saxpy(Ax, r, na, N);
r0 = r1;
r1 = dotProduct(r, r, N);
printf("\nCPU code iteration = %3d, residual = %e\n", k, sqrt(r1));
k++;
}
}
__device__ void gpuSpMV(int *I, int *J, float *val, int nnz, int num_rows,
float alpha, float *inputVecX, float *outputVecY,
cg::thread_block &cta,
const cg::multi_grid_group &multi_grid) {
for (int i = multi_grid.thread_rank(); i < num_rows; i += multi_grid.size()) {
int row_elem = I[i];
int next_row_elem = I[i + 1];
int num_elems_this_row = next_row_elem - row_elem;
float output = 0.0;
for (int j = 0; j < num_elems_this_row; j++) {
output += alpha * val[row_elem + j] * inputVecX[J[row_elem + j]];
}
outputVecY[i] = output;
}
}
__device__ void gpuSaxpy(float *x, float *y, float a, int size,
const cg::multi_grid_group &multi_grid) {
for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) {
y[i] = a * x[i] + y[i];
}
}
__device__ void gpuDotProduct(float *vecA, float *vecB, int size,
const cg::thread_block &cta,
const cg::multi_grid_group &multi_grid) {
__shared__ double tmp[THREADS_PER_BLOCK];
double temp_sum = 0.0;
for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) {
temp_sum += (double)(vecA[i] * vecB[i]);
}
tmp[cta.thread_rank()] = temp_sum;
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
double beta = temp_sum;
double temp;
for (int i = tile32.size() / 2; i > 0; i >>= 1) {
if (tile32.thread_rank() < i) {
temp = tmp[cta.thread_rank() + i];
beta += temp;
tmp[cta.thread_rank()] = beta;
}
cg::sync(tile32);
}
cg::sync(cta);
if (cta.thread_rank() == 0) {
beta = 0.0;
for (int i = 0; i < cta.size(); i += tile32.size()) {
beta += tmp[i];
}
atomicAdd(&grid_dot_result, beta);
}
}
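// Minimal sketch of the same block-level sum reduction using only warp shuffles
// (hypothetical device function, not called by the sample). Assumes blockDim.x is a
// multiple of 32 and at most 1024; the result is valid in thread 0 of the block only.
__device__ double blockReduceSumSketch(double v) {
  __shared__ double warp_sums[32];                   // one partial sum per warp
  unsigned int lane = threadIdx.x & 31;
  unsigned int warp = threadIdx.x >> 5;
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffff, v, offset);    // warp-local tree reduction
  if (lane == 0) warp_sums[warp] = v;
  __syncthreads();
  double total = 0.0;
  if (threadIdx.x == 0)
    for (unsigned int w = 0; w < blockDim.x / 32; w++)
      total += warp_sums[w];
  return total;
}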
__device__ void gpuCopyVector(float *srcA, float *destB, int size,
const cg::multi_grid_group &multi_grid) {
for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) {
destB[i] = srcA[i];
}
}
__device__ void gpuScaleVector(float *vec, float alpha, int size,
const cg::multi_grid_group &multi_grid) {
for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) {
vec[i] = alpha * vec[i];
}
}
__device__ void setDotResultToZero(double *dot_result) {
unsigned long long int *address_as_ull = (unsigned long long int *)dot_result;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS_system(address_as_ull, assumed, 0);
} while (assumed != old);
}
extern "C" __global__ void multiGpuConjugateGradient(
int *I, int *J, float *val, float *x, float *Ax, float *p, float *r,
double *dot_result, int nnz, int N, float tol) {
cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
cg::multi_grid_group multi_grid = cg::this_multi_grid();
const int max_iter = 10000;
float alpha = 1.0;
float alpham1 = -1.0;
float r0 = 0.0, r1, b, a, na;
for (int i = multi_grid.thread_rank(); i < N; i += multi_grid.size()) {
r[i] = 1.0;
x[i] = 0.0;
}
cg::sync(grid);
gpuSpMV(I, J, val, nnz, N, alpha, x, Ax, cta, multi_grid);
cg::sync(grid);
gpuSaxpy(Ax, r, alpham1, N, multi_grid);
cg::sync(grid);
gpuDotProduct(r, r, N, cta, multi_grid);
cg::sync(grid);
if (grid.thread_rank() == 0) {
atomicAdd_system(dot_result, grid_dot_result);
grid_dot_result = 0.0;
}
cg::sync(multi_grid);
r1 = *dot_result;
int k = 1;
while (r1 > tol * tol && k <= max_iter) {
if (k > 1) {
b = r1 / r0;
gpuScaleVector(p, b, N, multi_grid);
cg::sync(grid);
gpuSaxpy(r, p, alpha, N, multi_grid);
} else {
gpuCopyVector(r, p, N, multi_grid);
}
cg::sync(multi_grid);
gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, cta, multi_grid);
if (multi_grid.thread_rank() == 0) {
setDotResultToZero(dot_result);
}
cg::sync(multi_grid);
gpuDotProduct(p, Ax, N, cta, multi_grid);
cg::sync(grid);
if (grid.thread_rank() == 0) {
atomicAdd_system(dot_result, grid_dot_result);
grid_dot_result = 0.0;
}
cg::sync(multi_grid);
a = r1 / *dot_result;
gpuSaxpy(p, x, a, N, multi_grid);
na = -a;
gpuSaxpy(Ax, r, na, N, multi_grid);
r0 = r1;
cg::sync(multi_grid);
if (multi_grid.thread_rank() == 0) {
setDotResultToZero(dot_result);
}
cg::sync(multi_grid);
gpuDotProduct(r, r, N, cta, multi_grid);
cg::sync(grid);
if (grid.thread_rank() == 0) {
atomicAdd_system(dot_result, grid_dot_result);
grid_dot_result = 0.0;
}
cg::sync(multi_grid);
r1 = *dot_result;
k++;
}
}
void getIdenticalGPUs(int num_of_gpus, std::set<int> &identicalGPUs) {
int *major_minor = (int *)malloc(sizeof(int) * num_of_gpus * 2);
int foundIdenticalGPUs = 0;
for (int i = 0; i < num_of_gpus; i++) {
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
major_minor[i * 2] = deviceProp.major;
major_minor[i * 2 + 1] = deviceProp.minor;
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i,
deviceProp.name, deviceProp.major, deviceProp.minor);
}
int maxMajorMinor[2] = {0, 0};
for (int i = 0; i < num_of_gpus; i++) {
for (int j = i + 1; j < num_of_gpus; j++) {
if ((major_minor[i * 2] == major_minor[j * 2]) &&
(major_minor[i * 2 + 1] == major_minor[j * 2 + 1])) {
identicalGPUs.insert(i);
identicalGPUs.insert(j);
foundIdenticalGPUs = 1;
if (maxMajorMinor[0] < major_minor[i * 2] &&
maxMajorMinor[1] < major_minor[i * 2 + 1]) {
maxMajorMinor[0] = major_minor[i * 2];
maxMajorMinor[1] = major_minor[i * 2 + 1];
}
}
}
}
free(major_minor);
if (!foundIdenticalGPUs) {
printf(
"No two or more GPUs with the same architecture were found\nWaiving the "
"sample\n");
exit(EXIT_WAIVED);
}
std::set<int>::iterator it = identicalGPUs.begin();
// Iterate over all the identical GPUs found
while (it != identicalGPUs.end()) {
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *it));
// Remove all the GPUs which are less than the best arch available, or which
// lack cooperative multi-device launch / concurrent managed access support
if ((deviceProp.major != maxMajorMinor[0] &&
deviceProp.minor != maxMajorMinor[1]) ||
!deviceProp.cooperativeMultiDeviceLaunch ||
!deviceProp.concurrentManagedAccess) {
it = identicalGPUs.erase(it); // erase() returns the next valid iterator
} else {
it++;
}
}
return;
}
int main(int argc, char **argv) {
int N = 0, nz = 0, *I = NULL, *J = NULL;
float *val = NULL;
const float tol = 1e-5f;
float *x;
float rhs = 1.0;
float r1;
float *r, *p, *Ax;
printf("Starting [%s]...\n", sSDKname);
int num_of_gpus = 0;
checkCudaErrors(cudaGetDeviceCount(&num_of_gpus));
if (num_of_gpus <= 1) {
printf("No. of GPU on node %d\n", num_of_gpus);
printf("Minimum Two or more GPUs are required to run this sample code\n");
exit(EXIT_WAIVED);
}
std::set<int> identicalGPUs;
getIdenticalGPUs(num_of_gpus, identicalGPUs);
if (identicalGPUs.size() <= 1) {
printf(
"No two or more GPUs with the same architecture capable of "
"cooperativeMultiDeviceLaunch & concurrentManagedAccess were found.\nWaiving the sample\n");
exit(EXIT_WAIVED);
}
std::set<int>::iterator deviceId = identicalGPUs.begin();
// We use only 2 GPUs, since two are enough for an input size of N = 10485760*2.
while (identicalGPUs.size() > 2) {
deviceId = identicalGPUs.erase(deviceId); // erase() returns the next valid iterator
}
/* Generate a random tridiagonal symmetric matrix in CSR format */
N = 10485760 * 2;
nz = (N - 2) * 3 + 4;
checkCudaErrors(cudaMallocManaged((void **)&I, sizeof(int) * (N + 1)));
checkCudaErrors(cudaMallocManaged((void **)&J, sizeof(int) * nz));
checkCudaErrors(cudaMallocManaged((void **)&val, sizeof(float) * nz));
float *val_cpu = (float *)malloc(sizeof(float) * nz);
genTridiag(I, J, val_cpu, N, nz);
memcpy(val, val_cpu, sizeof(float) * nz);
checkCudaErrors(
cudaMemAdvise(I, sizeof(int) * (N + 1), cudaMemAdviseSetReadMostly, 0));
checkCudaErrors(
cudaMemAdvise(J, sizeof(int) * nz, cudaMemAdviseSetReadMostly, 0));
checkCudaErrors(
cudaMemAdvise(val, sizeof(float) * nz, cudaMemAdviseSetReadMostly, 0));
checkCudaErrors(cudaMallocManaged((void **)&x, sizeof(float) * N));
double *dot_result;
checkCudaErrors(cudaMallocManaged((void **)&dot_result, sizeof(double)));
  checkCudaErrors(cudaMemset(dot_result, 0, sizeof(double)));
// temp memory for ConjugateGradient
checkCudaErrors(cudaMallocManaged((void **)&r, N * sizeof(float)));
checkCudaErrors(cudaMallocManaged((void **)&p, N * sizeof(float)));
checkCudaErrors(cudaMallocManaged((void **)&Ax, N * sizeof(float)));
std::cout << "\nRunning on GPUs = " << identicalGPUs.size() << std::endl;
cudaStream_t *nStreams =
(cudaStream_t *)malloc(sizeof(cudaStream_t) * identicalGPUs.size());
void *kernelArgs[] = {
(void *)&I, (void *)&J, (void *)&val, (void *)&x,
(void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result,
(void *)&nz, (void *)&N, (void *)&tol,
};
int sMemSize = sizeof(double) * THREADS_PER_BLOCK;
int numBlocksPerSm = 0;
int numThreads = THREADS_PER_BLOCK;
deviceId = identicalGPUs.begin();
cudaDeviceProp deviceProp;
checkCudaErrors(cudaSetDevice(*deviceId));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId));
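  // A cooperative launch requires every block of the grid to be resident on
  // the device at once, so the grid is sized to the number of co-resident
  // blocks: (max active blocks per SM from the occupancy query) x (number of SMs).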
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm, multiGpuConjugateGradient, numThreads, sMemSize));
int numSms = deviceProp.multiProcessorCount;
dim3 dimGrid(numSms * numBlocksPerSm, 1, 1),
dimBlock(THREADS_PER_BLOCK, 1, 1);
int device_count = 0;
int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK;
while (deviceId != identicalGPUs.end()) {
cudaDeviceProp deviceProp;
checkCudaErrors(cudaSetDevice(*deviceId));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId));
checkCudaErrors(cudaStreamCreate(&nStreams[device_count]));
if (deviceProp.concurrentManagedAccess) {
int perGPUIter = N / (totalThreadsPerGPU * identicalGPUs.size());
int offset_Ax = device_count * totalThreadsPerGPU;
int offset_r = device_count * totalThreadsPerGPU;
int offset_p = device_count * totalThreadsPerGPU;
int offset_x = device_count * totalThreadsPerGPU;
checkCudaErrors(cudaMemPrefetchAsync(I, sizeof(int) * N, *deviceId,
nStreams[device_count]));
checkCudaErrors(cudaMemPrefetchAsync(val, sizeof(float) * nz, *deviceId,
nStreams[device_count]));
      checkCudaErrors(cudaMemPrefetchAsync(J, sizeof(int) * nz, *deviceId,
                                           nStreams[device_count]));
if (offset_Ax <= N) {
for (int i = 0; i < perGPUIter; i++) {
cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetPreferredLocation, *deviceId);
cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetPreferredLocation, *deviceId);
cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetPreferredLocation, *deviceId);
cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetPreferredLocation, *deviceId);
cudaMemAdvise(Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetAccessedBy, *deviceId);
cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetAccessedBy, *deviceId);
cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetAccessedBy, *deviceId);
cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU,
cudaMemAdviseSetAccessedBy, *deviceId);
offset_Ax += totalThreadsPerGPU * identicalGPUs.size();
offset_r += totalThreadsPerGPU * identicalGPUs.size();
offset_p += totalThreadsPerGPU * identicalGPUs.size();
offset_x += totalThreadsPerGPU * identicalGPUs.size();
if (offset_Ax >= N) {
break;
}
}
}
}
device_count++;
deviceId++;
}
#if ENABLE_CPU_DEBUG_CODE
float *Ax_cpu = (float *)malloc(sizeof(float) * N);
float *r_cpu = (float *)malloc(sizeof(float) * N);
float *p_cpu = (float *)malloc(sizeof(float) * N);
float *x_cpu = (float *)malloc(sizeof(float) * N);
for (int i = 0; i < N; i++) {
r_cpu[i] = 1.0;
Ax_cpu[i] = x_cpu[i] = 0.0;
}
#endif
printf("Total threads per GPU = %d numBlocksPerSm = %d\n",
numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm);
cudaLaunchParams *launchParamsList = (cudaLaunchParams *)malloc(
sizeof(cudaLaunchParams) * identicalGPUs.size());
for (int i = 0; i < identicalGPUs.size(); i++) {
launchParamsList[i].func = (void *)multiGpuConjugateGradient;
launchParamsList[i].gridDim = dimGrid;
launchParamsList[i].blockDim = dimBlock;
launchParamsList[i].sharedMem = sMemSize;
launchParamsList[i].stream = nStreams[i];
launchParamsList[i].args = kernelArgs;
}
printf("Launching kernel\n");
checkCudaErrors(cudaLaunchCooperativeKernelMultiDevice(
launchParamsList, identicalGPUs.size(),
cudaCooperativeLaunchMultiDeviceNoPreSync |
cudaCooperativeLaunchMultiDeviceNoPostSync));
if (deviceProp.concurrentManagedAccess) {
checkCudaErrors(
cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId));
checkCudaErrors(
cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId));
}
deviceId = identicalGPUs.begin();
device_count = 0;
while (deviceId != identicalGPUs.end()) {
checkCudaErrors(cudaSetDevice(*deviceId));
checkCudaErrors(cudaStreamSynchronize(nStreams[device_count++]));
deviceId++;
}
r1 = *dot_result;
printf("GPU Final, residual = %e \n ", sqrt(r1));
#if ENABLE_CPU_DEBUG_CODE
cpuConjugateGrad(I, J, val, x_cpu, Ax_cpu, p_cpu, r_cpu, nz, N, tol);
#endif
float rsum, diff, err = 0.0;
for (int i = 0; i < N; i++) {
rsum = 0.0;
for (int j = I[i]; j < I[i + 1]; j++) {
rsum += val_cpu[j] * x[J[j]];
}
diff = fabs(rsum - rhs);
if (diff > err) {
err = diff;
}
}
checkCudaErrors(cudaFree(I));
checkCudaErrors(cudaFree(J));
checkCudaErrors(cudaFree(val));
checkCudaErrors(cudaFree(x));
checkCudaErrors(cudaFree(r));
checkCudaErrors(cudaFree(p));
checkCudaErrors(cudaFree(Ax));
checkCudaErrors(cudaFree(dot_result));
free(val_cpu);
#if ENABLE_CPU_DEBUG_CODE
free(Ax_cpu);
free(r_cpu);
free(p_cpu);
free(x_cpu);
#endif
printf("Test Summary: Error amount = %f \n", err);
fprintf(stdout, "&&&& conjugateGradientMultiDeviceCG %s\n",
(sqrt(r1) < tol) ? "PASSED" : "FAILED");
exit((sqrt(r1) < tol) ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// Shuffle intrinsics CUDA Sample
// This sample demonstrates the use of the shuffle intrinsic
// First, a simple example of a prefix sum using the shuffle to
// perform a scan operation is provided.
// Secondly, a more involved example of computing an integral image
// using the shuffle intrinsic is provided, where the shuffle
// scan operation and shuffle xor operations are used
#include <stdio.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "shfl_integral_image.cuh"
// Scan using shfl - takes log2(n) steps
// This function demonstrates basic use of the shuffle intrinsic, __shfl_up,
// to perform a scan operation across a block.
// First, it performs a scan (prefix sum in this case) inside a warp
// Then to continue the scan operation across the block,
// each warp's sum is placed into shared memory. A single warp
// then performs a shuffle scan on that shared memory. The results
// are then uniformly added to each warp's threads.
// This pyramid type approach is continued by placing each block's
// final sum in global memory and prefix summing that via another kernel call, then
// uniformly adding across the input data via the uniform_add<<<>>> kernel.
__global__ void shfl_scan_test(int *data, int width, int *partial_sums=NULL)
{
extern __shared__ int sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
int lane_id = id % warpSize;
// determine a warp_id within a block
int warp_id = threadIdx.x / warpSize;
// Below is the basic structure of using a shfl instruction
// for a scan.
// Record "value" as a variable - we accumulate it along the way
int value = data[id];
    // Now accumulate in log2(width) steps up the chain:
    // each thread adds in the value held by the thread a distance 'i'
    // (the current delta) below it in the warp. Threads for which that
    // source lane would fall outside the warp are left unchanged. This
    // produces the scan sum.
#pragma unroll
for (int i=1; i<=width; i*=2)
{
unsigned int mask = 0xffffffff;
int n = __shfl_up_sync(mask, value, i, width);
if (lane_id >= i) value += n;
}
// value now holds the scan value for the individual thread
// next sum the largest values for each warp
// write the sum of the warp to smem
if (threadIdx.x % warpSize == warpSize-1)
{
sums[warp_id] = value;
}
__syncthreads();
//
// scan sum the warp sums
// the same shfl scan operation, but performed on warp sums
//
if (warp_id == 0 && lane_id < (blockDim.x / warpSize))
{
int warp_sum = sums[lane_id];
int mask = (1 << (blockDim.x / warpSize)) - 1;
for (int i=1; i<=(blockDim.x / warpSize); i*=2)
{
int n = __shfl_up_sync(mask, warp_sum, i, (blockDim.x / warpSize));
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
// perform a uniform add across warps in the block
// read neighbouring warp's sum and add it to threads value
int blockSum = 0;
if (warp_id > 0)
{
blockSum = sums[warp_id-1];
}
value += blockSum;
// Now write out our result
data[id] = value;
    // the last thread holds the block's sum; write it out
if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
{
partial_sums[blockIdx.x] = value;
}
}
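// Illustrative sketch, not used by this sample: the same warp-level inclusive
// scan idiom in isolation, for a single fully-active 32-lane warp. The helper
// name is hypothetical; shfl_scan_test above performs this pattern inline.
__device__ int warpInclusiveScanSketch(int value)
{
    const unsigned int mask = 0xffffffff;
    int lane_id = threadIdx.x % warpSize;
    for (int i = 1; i < warpSize; i *= 2)
    {
        int n = __shfl_up_sync(mask, value, i, warpSize);
        if (lane_id >= i) value += n;
    }
    return value;
}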
// Uniform add: add partial sums array
__global__ void uniform_add(int *data, int *partial_sums, int len)
{
__shared__ int buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
if (id > len) return;
if (threadIdx.x == 0)
{
buf = partial_sums[blockIdx.x];
}
__syncthreads();
data[id] += buf;
}
static unsigned int iDivUp(unsigned int dividend, unsigned int divisor)
{
return ((dividend % divisor) == 0) ?
(dividend / divisor) :
(dividend / divisor + 1);
}
// This function verifies the shuffle scan result, for the simple
// prefix sum case.
bool CPUverify(int *h_data, int *h_result, int n_elements)
{
// cpu verify
for (int i=0; i<n_elements-1; i++)
{
h_data[i+1] = h_data[i] + h_data[i+1];
}
int diff = 0;
for (int i=0 ; i<n_elements; i++)
{
diff += h_data[i]-h_result[i];
}
printf("CPU verify result diff (GPUvsCPU) = %d\n", diff);
bool bTestResult = false;
if (diff == 0) bTestResult = true;
StopWatchInterface *hTimer = NULL;
sdkCreateTimer(&hTimer);
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
for (int j=0; j<100; j++)
for (int i=0; i<n_elements-1; i++)
{
h_data[i+1] = h_data[i] + h_data[i+1];
}
sdkStopTimer(&hTimer);
double cput= sdkGetTimerValue(&hTimer);
printf("CPU sum (naive) took %f ms\n", cput/100);
return bTestResult;
}
// this verifies the row scan result for synthetic data of all 1's
unsigned int verifyDataRowSums(unsigned int *h_image, int w, int h)
{
unsigned int diff = 0;
for (int j=0; j<h; j++)
{
for (int i=0; i<w; i++)
{
int gold = i+1;
diff += abs((int)gold-(int)h_image[j*w + i]);
}
}
return diff;
}
bool shuffle_simple_test(int argc, char **argv)
{
int *h_data, *h_partial_sums, *h_result;
int *d_data, *d_partial_sums;
const int n_elements = 65536;
int sz = sizeof(int)*n_elements;
int cuda_device = 0;
printf("Starting shfl_scan\n");
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// __shfl intrinsic needs SM 3.0 or higher
if (deviceProp.major < 3)
{
printf("> __shfl() intrinsic requires device SM 3.0+\n");
printf("> Waiving test.\n");
exit(EXIT_WAIVED);
}
checkCudaErrors(cudaMallocHost((void **)&h_data, sizeof(int)*n_elements));
checkCudaErrors(cudaMallocHost((void **)&h_result, sizeof(int)*n_elements));
//initialize data:
printf("Computing Simple Sum test\n");
printf("---------------------------------------------------\n");
printf("Initialize test data [1, 1, 1...]\n");
for (int i=0; i<n_elements; i++)
{
h_data[i] = 1;
}
int blockSize = 256;
int gridSize = n_elements/blockSize;
int nWarps = blockSize/32;
int shmem_sz = nWarps * sizeof(int);
int n_partialSums = n_elements/blockSize;
int partial_sz = n_partialSums*sizeof(int);
printf("Scan summation for %d elements, %d partial sums\n",
n_elements, n_elements/blockSize);
int p_blockSize = min(n_partialSums, blockSize);
int p_gridSize = iDivUp(n_partialSums, p_blockSize);
printf("Partial summing %d elements with %d blocks of size %d\n",
n_partialSums, p_gridSize, p_blockSize);
// initialize a timer
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
float et = 0;
float inc = 0;
checkCudaErrors(cudaMalloc((void **)&d_data, sz));
checkCudaErrors(cudaMalloc((void **)&d_partial_sums, partial_sz));
checkCudaErrors(cudaMemset(d_partial_sums, 0, partial_sz));
checkCudaErrors(cudaMallocHost((void **)&h_partial_sums, partial_sz));
checkCudaErrors(cudaMemcpy(d_data, h_data, sz, cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(start, 0));
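    // Three launches form the scan pyramid: (1) scan each blockSize-element
    // chunk and emit its block total, (2) scan the array of block totals,
    // then (3) uniformly add the scanned totals back to every block after the
    // first (hence the d_data + blockSize offset and gridSize - 1 blocks).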
shfl_scan_test<<<gridSize,blockSize, shmem_sz>>>(d_data, 32, d_partial_sums);
shfl_scan_test<<<p_gridSize,p_blockSize, shmem_sz>>>(d_partial_sums,32);
uniform_add<<<gridSize-1, blockSize>>>(d_data+blockSize, d_partial_sums, n_elements);
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&inc, start, stop));
et+=inc;
checkCudaErrors(cudaMemcpy(h_result, d_data, sz, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_partial_sums, d_partial_sums, partial_sz,
cudaMemcpyDeviceToHost));
printf("Test Sum: %d\n", h_partial_sums[n_partialSums-1]);
printf("Time (ms): %f\n", et);
printf("%d elements scanned in %f ms -> %f MegaElements/s\n",
n_elements, et, n_elements/(et/1000.0f)/1000000.0f);
bool bTestResult = CPUverify(h_data, h_result, n_elements);
checkCudaErrors(cudaFreeHost(h_data));
checkCudaErrors(cudaFreeHost(h_result));
checkCudaErrors(cudaFreeHost(h_partial_sums));
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_partial_sums));
return bTestResult;
}
// This function tests creation of an integral image using
// synthetic data, of size 1920x1080 pixels greyscale.
bool shuffle_integral_image_test()
{
char *d_data;
unsigned int *h_image;
unsigned int *d_integral_image;
int w = 1920;
int h = 1080;
int n_elements = w*h;
int sz = sizeof(unsigned int)*n_elements;
printf("\nComputing Integral Image Test on size %d x %d synthetic data\n", w, h);
printf("---------------------------------------------------\n");
checkCudaErrors(cudaMallocHost((void **)&h_image, sz));
    // clear the host result buffer (the synthetic all-1's image is created on the device below)
memset(h_image, 0, sz);
// each thread handles 16 values, use 1 block/row
int blockSize = iDivUp(w,16);
// launch 1 block / row
int gridSize = h;
// Create a synthetic image for testing
checkCudaErrors(cudaMalloc((void **)&d_data, sz));
checkCudaErrors(cudaMalloc((void **)&d_integral_image, n_elements*sizeof(int)*4));
checkCudaErrors(cudaMemset(d_data, 1, sz));
checkCudaErrors(cudaMemset(d_integral_image, 0, sz));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float et = 0;
unsigned int err;
// Execute scan line prefix sum kernel, and time it
cudaEventRecord(start);
shfl_intimage_rows<<<gridSize,blockSize>>>((uint4 *)d_data, (uint4 *)d_integral_image);
cudaEventRecord(stop);
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
printf("Method: Fast Time (GPU Timer): %f ms ", et);
// verify the scan line results
checkCudaErrors(cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
err = verifyDataRowSums(h_image, w, h);
printf("Diff = %d\n", err);
// Execute column prefix sum kernel and time it
dim3 blockSz(32, 8);
dim3 testGrid(w/blockSz.x, 1);
cudaEventRecord(start);
shfl_vertical_shfl<<<testGrid,blockSz>>>((unsigned int *)d_integral_image, w, h);
cudaEventRecord(stop);
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&et, start, stop));
printf("Method: Vertical Scan Time (GPU Timer): %f ms ", et);
// Verify the column results
checkCudaErrors(cudaMemcpy(h_image, d_integral_image, sz, cudaMemcpyDeviceToHost));
printf("\n");
int finalSum = h_image[w*h-1];
printf("CheckSum: %d, (expect %dx%d=%d)\n", finalSum, w,h, w *h);
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_integral_image));
checkCudaErrors(cudaFreeHost(h_image));
// verify final sum: if the final value in the corner is the same as the size of the
// buffer (all 1's) then the integral image was generated successfully
return (finalSum == w*h)? true : false;
}
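// For reference: an integral image (summed area table) stores at (x, y) the
// sum of all input pixels with coordinates <= (x, y). Once built, the sum over
// any axis-aligned rectangle spanning (x0, y0)..(x1, y1) needs only four
// lookups:
//   S = ii(x1, y1) - ii(x0 - 1, y1) - ii(x1, y0 - 1) + ii(x0 - 1, y0 - 1)
// which is why the structure is widely used in filtering and detection.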
int main(int argc, char *argv[])
{
// Initialization. The shuffle intrinsic is not available on SM < 3.0
// so waive the test if the hardware is not present.
int cuda_device = 0;
printf("Starting shfl_scan\n");
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// __shfl intrinsic needs SM 3.0 or higher
if (deviceProp.major < 3)
{
printf("> __shfl() intrinsic requires device SM 3.0+\n");
printf("> Waiving test.\n");
exit(EXIT_WAIVED);
}
bool bTestResult = true;
bool simpleTest = shuffle_simple_test(argc, argv);
bool intTest = shuffle_integral_image_test();
    bTestResult = simpleTest && intTest;
exit((bTestResult) ? EXIT_SUCCESS : EXIT_FAILURE);
}
// Utility function to extract unsigned chars from an
// unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
    bytes.x = (in & 0x000000ff) >> 0;
    bytes.y = (in & 0x0000ff00) >> 8;
    bytes.z = (in & 0x00ff0000) >> 16;
    bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
// This function demonstrates some uses of the shuffle instruction
// in the generation of an integral image (also
// called a summed area table)
// The approach is two pass, a horizontal (scanline) then a vertical
// (column) pass.
// This is the horizontal pass kernel.
__global__ void shfl_intimage_rows(uint4 *img, uint4 *integral_image)
{
__shared__ int sums[128];
int id = threadIdx.x;
// pointer to head of current scanline
uint4 *scanline = &img[ blockIdx.x*120 ];
uint4 data;
data = scanline[id];
int result[16];
int sum;
unsigned int lane_id = id % warpSize;
int warp_id = threadIdx.x / warpSize;
uchar4 a = int_to_uchar4(data.x);
uchar4 b = int_to_uchar4(data.y);
uchar4 c = int_to_uchar4(data.z);
uchar4 d = int_to_uchar4(data.w);
result[0] = a.x;
result[1] = a.x + a.y;
result[2] = a.x + a.y + a.z;
result[3] = a.x + a.y + a.z + a.w;
result[4] = b.x;
result[5] = b.x + b.y;
result[6] = b.x + b.y + b.z;
result[7] = b.x + b.y + b.z + b.w;
result[8] = c.x;
result[9] = c.x + c.y;
result[10] = c.x + c.y + c.z;
result[11] = c.x + c.y + c.z + c.w;
result[12] = d.x;
result[13] = d.x + d.y;
result[14] = d.x + d.y + d.z;
result[15] = d.x + d.y + d.z + d.w;
#pragma unroll
for (int i=4; i<=7 ; i++) result[i] += result[3];
#pragma unroll
for (int i=8; i<=11 ; i++) result[i] += result[7];
#pragma unroll
for (int i=12; i<=15 ; i++) result[i] += result[11];
sum = result[15];
    // the prefix sum over each thread's 16 values is computed;
    // now the final sums (result[15]) need to be shared with the
    // other threads and added in. To do this, the __shfl_up_sync()
    // instruction is used and a shuffle scan operation is performed
    // to distribute the sums to the correct threads
#pragma unroll
for (int i=1 ; i<32 ; i*=2)
{
unsigned int mask = 0xffffffff;
int n = __shfl_up_sync(mask, sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i=0 ; i<16; i++)
{
result[i] += n;
}
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// last thread in the warp holding sum of the warp
// places that in shared
if (threadIdx.x % warpSize == warpSize-1)
{
sums[warp_id] = result[15];
}
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i=1; i<=32; i*=2)
{
unsigned int mask = 0xffffffff;
int n = __shfl_up_sync(mask, warp_sum, i, 32);
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
    // fold in the running totals of the preceding warps
if (warp_id >0)
{
blockSum = sums[warp_id-1];
#pragma unroll
for (int i=0; i<16; i++)
{
result[i] += blockSum;
}
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
    // approach shown here uses the shuffle instruction to reformat
    // the data inside the registers so that each thread holds
    // consecutive data to be written, allowing larger contiguous
    // segments to be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after apply __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above is actually representing
four integers to be written as uint4's to GMEM.
*/
unsigned int mask = 0xffffffff;
uint4 output;
result[4] = __shfl_xor_sync(mask, result[4], 1, 32);
result[5] = __shfl_xor_sync(mask, result[5], 1, 32);
result[6] = __shfl_xor_sync(mask, result[6], 1, 32);
result[7] = __shfl_xor_sync(mask, result[7], 1, 32);
result[8] = __shfl_xor_sync(mask, result[8], 2, 32);
result[9] = __shfl_xor_sync(mask, result[9], 2, 32);
result[10] = __shfl_xor_sync(mask, result[10], 2, 32);
result[11] = __shfl_xor_sync(mask, result[11], 2, 32);
result[12] = __shfl_xor_sync(mask, result[12], 3, 32);
result[13] = __shfl_xor_sync(mask, result[13], 3, 32);
result[14] = __shfl_xor_sync(mask, result[14], 3, 32);
result[15] = __shfl_xor_sync(mask, result[15], 3, 32);
if (threadIdx.x % 4 == 0)
{
output = make_uint4(result[0], result[1], result[2], result[3]);
}
if (threadIdx.x % 4 == 1)
{
output = make_uint4(result[4], result[5], result[6], result[7]);
}
if (threadIdx.x % 4 == 2)
{
output = make_uint4(result[8], result[9], result[10], result[11]);
}
if (threadIdx.x % 4 == 3)
{
output = make_uint4(result[12], result[13], result[14], result[15]);
}
integral_image[ blockIdx.x*480 + threadIdx.x%4 + (threadIdx.x/4)*16]
= output;
if (threadIdx.x % 4 == 2)
{
output = make_uint4(result[0], result[1], result[2], result[3]);
}
if (threadIdx.x % 4 == 3)
{
output = make_uint4(result[4], result[5], result[6], result[7]);
}
if (threadIdx.x % 4 == 0)
{
output = make_uint4(result[8], result[9], result[10], result[11]);
}
if (threadIdx.x % 4 == 1)
{
output = make_uint4(result[12], result[13], result[14], result[15]);
}
integral_image[ blockIdx.x*480 + (threadIdx.x+2)%4 + (threadIdx.x/4)*16 + 8]
= output;
// continuing from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
#pragma unroll
for (int i=0; i<16; i++)
{
result[i] = __shfl_xor_sync(mask, result[i],1, 32);
}
if (threadIdx.x % 4 == 0)
{
output = make_uint4(result[0], result[1], result[2], result[3]);
}
if (threadIdx.x % 4 == 1)
{
output = make_uint4(result[4], result[5], result[6], result[7]);
}
if (threadIdx.x % 4 == 2)
{
output = make_uint4(result[8], result[9], result[10], result[11]);
}
if (threadIdx.x % 4 == 3)
{
output = make_uint4(result[12], result[13], result[14], result[15]);
}
integral_image[ blockIdx.x*480 + threadIdx.x%4 + (threadIdx.x/4)*16 + 4]
= output;
if (threadIdx.x % 4 == 2)
{
output = make_uint4(result[0], result[1], result[2], result[3]);
}
if (threadIdx.x % 4 == 3)
{
output = make_uint4(result[4], result[5], result[6], result[7]);
}
if (threadIdx.x % 4 == 0)
{
output = make_uint4(result[8], result[9], result[10], result[11]);
}
if (threadIdx.x % 4 == 1)
{
output = make_uint4(result[12], result[13], result[14], result[15]);
}
integral_image[ blockIdx.x*480 + (threadIdx.x+2)%4 + (threadIdx.x/4)*16 + 12]
= output;
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First, the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_vertical_shfl(unsigned int *img, int width, int height)
{
__shared__ unsigned int sums[32][9];
int tidx = blockIdx.x * blockDim.x + threadIdx.x;
//int warp_id = threadIdx.x / warpSize ;
unsigned int lane_id = tidx % 8;
//int rows_per_thread = (height / blockDim. y) ;
//int start_row = rows_per_thread * threadIdx.y;
unsigned int stepSum = 0;
unsigned int mask = 0xffffffff;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
for (int step = 0 ; step < 135 ; step++)
{
unsigned int sum = 0;
unsigned int *p = img + (threadIdx.y+step*8)*width + tidx;
sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
    // shfl scan reduce the SMEM, reformatting so the column
// sums are computed in a warp
// then read out properly
int partial_sum = 0;
int j = threadIdx.x %8;
int k = threadIdx.x/8 + threadIdx.y*4;
partial_sum = sums[k][j];
for (int i=1 ; i<=8 ; i*=2)
{
int n = __shfl_up_sync(mask, partial_sum, i, 32);
if (lane_id >= i) partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
{
sum += sums[threadIdx.x][threadIdx.y-1];
}
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y-1];
__syncthreads();
*p = sum ;
}
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
Parallel reduction kernels
*/
#ifndef _REDUCE_KERNEL_H_
#define _REDUCE_KERNEL_H_
#include <stdio.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory
{
__device__ inline operator T *()
{
extern __shared__ int __smem[];
return (T *)__smem;
}
__device__ inline operator const T *() const
{
extern __shared__ int __smem[];
return (T *)__smem;
}
};
// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double>
{
__device__ inline operator double *()
{
extern __shared__ double __smem_d[];
return (double *)__smem_d;
}
__device__ inline operator const double *() const
{
extern __shared__ double __smem_d[];
return (double *)__smem_d;
}
};
/*
Parallel sum reduction using shared memory
- takes log(n) steps for n input elements
- uses n threads
- only works for power-of-2 arrays
*/
/* This reduction interleaves which threads are active by using the modulo
operator. This operator is very expensive on GPUs, and the interleaved
inactivity means that no whole warps are active, which is also very
inefficient */
template <class T>
__global__ void
reduce0(T *g_idata, T *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s=1; s < blockDim.x; s *= 2)
{
// modulo arithmetic is slow!
if ((tid % (2*s)) == 0)
{
sdata[tid] += sdata[tid + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/* This version uses contiguous threads, but its interleaved
addressing results in many shared memory bank conflicts.
*/
template <class T>
__global__ void
reduce1(T *g_idata, T *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s=1; s < blockDim.x; s *= 2)
{
int index = 2 * s * tid;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/*
This version uses sequential addressing -- no divergence or bank conflicts.
*/
template <class T>
__global__ void
reduce2(T *g_idata, T *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/*
This version uses n/2 threads --
it performs the first level of reduction when reading from global memory.
*/
template <class T>
__global__ void
reduce3(T *g_idata, T *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockDim.x < n)
mySum += g_idata[i+blockDim.x];
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s)
{
sdata[tid] = mySum = mySum + sdata[tid + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
/*
This version uses the warp shuffle operation if available to reduce
warp synchronization. When shuffle is not available the final warp's
worth of work is unrolled to reduce looping overhead.
See http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
for additional information about using shuffle to perform a reduction
within a warp.
Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize>
__global__ void
reduce4(T *g_idata, T *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n)
mySum += g_idata[i+blockSize];
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s)
{
sdata[tid] = mySum = mySum + sdata[tid + s];
}
cg::sync(cta);
}
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
if (cta.thread_rank() < 32)
{
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) mySum += sdata[tid + 32];
// Reduce final warp using shuffle
for (int offset = tile32.size()/2; offset > 0; offset /= 2)
{
mySum += tile32.shfl_down(mySum, offset);
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}
/*
This version is completely unrolled, unless warp shuffle is available, then
shuffle is used within a loop. It uses a template parameter to achieve
optimal code for any (power of 2) number of threads. This requires a switch
statement in the host code to handle all the different thread block sizes at
compile time. When shuffle is available, it is used to reduce warp synchronization.
Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize>
__global__ void
reduce5(T *g_idata, T *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n)
mySum += g_idata[i+blockSize];
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256))
{
sdata[tid] = mySum = mySum + sdata[tid + 256];
}
cg::sync(cta);
if ((blockSize >= 256) &&(tid < 128))
{
sdata[tid] = mySum = mySum + sdata[tid + 128];
}
cg::sync(cta);
if ((blockSize >= 128) && (tid < 64))
{
sdata[tid] = mySum = mySum + sdata[tid + 64];
}
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
if (cta.thread_rank() < 32)
{
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) mySum += sdata[tid + 32];
// Reduce final warp using shuffle
for (int offset = tile32.size()/2; offset > 0; offset /= 2)
{
mySum += tile32.shfl_down(mySum, offset);
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}
/*
This version adds multiple elements per thread sequentially. This reduces the overall
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
(Brent's Theorem optimization)
Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void
reduce6(T *g_idata, T *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
T mySum = 0;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
mySum += g_idata[i];
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
mySum += g_idata[i+blockSize];
i += gridSize;
}
// each thread puts its local sum into shared memory
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256))
{
sdata[tid] = mySum = mySum + sdata[tid + 256];
}
cg::sync(cta);
if ((blockSize >= 256) &&(tid < 128))
{
sdata[tid] = mySum = mySum + sdata[tid + 128];
}
cg::sync(cta);
if ((blockSize >= 128) && (tid < 64))
{
sdata[tid] = mySum = mySum + sdata[tid + 64];
}
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
if (cta.thread_rank() < 32)
{
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) mySum += sdata[tid + 32];
// Reduce final warp using shuffle
for (int offset = tile32.size()/2; offset > 0; offset /= 2)
{
mySum += tile32.shfl_down(mySum, offset);
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}
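/*
    Illustrative sketch, not part of the kernels above: the final-warp
    reduction expressed directly with __shfl_down_sync() instead of a
    cooperative-groups tile. After the loop, lane 0 holds the sum of all
    32 lanes' contributions. The helper name is hypothetical.
*/
template <class T>
__device__ T warpReduceSumSketch(T val)
{
    const unsigned int mask = 0xffffffff;
    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    {
        val += __shfl_down_sync(mask, val, offset);
    }
    return val;
}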
extern "C"
bool isPow2(unsigned int x);
////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template <class T>
void
reduce(int size, int threads, int blocks,
int whichKernel, T *d_idata, T *d_odata)
{
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
    // when there is only one warp per block, we need to allocate two warps'
    // worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
// choose which of the optimized versions of reduction to launch
switch (whichKernel)
{
case 0:
reduce0<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce1<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce2<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 3:
reduce3<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
switch (threads)
{
case 512:
reduce4<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce4<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce4<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce4<T, 64><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce4<T, 32><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce4<T, 16><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce4<T, 8><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce4<T, 4><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce4<T, 2><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce4<T, 1><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
break;
case 5:
switch (threads)
{
case 512:
reduce5<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce5<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce5<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce5<T, 64><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce5<T, 32><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce5<T, 16><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce5<T, 8><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce5<T, 4><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce5<T, 2><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce5<T, 1><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
break;
case 6:
default:
if (isPow2(size))
{
switch (threads)
{
case 512:
reduce6<T, 512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce6<T, 256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce6<T, 128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce6<T, 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce6<T, 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce6<T, 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce6<T, 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce6<T, 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce6<T, 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce6<T, 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
else
{
switch (threads)
{
case 512:
reduce6<T, 512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce6<T, 256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce6<T, 128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce6<T, 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce6<T, 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce6<T, 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce6<T, 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce6<T, 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce6<T, 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce6<T, 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
break;
}
}
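// Usage sketch (illustrative; the sample's host driver lives in a separate
// file): reducing n floats with kernel variant 6. d_idata and d_odata are
// assumed to be caller-owned device allocations, with one d_odata element
// per block.
//
//   int threads = 256;
//   int blocks  = (n + threads * 2 - 1) / (threads * 2);
//   reduce<float>(n, threads, blocks, 6, d_idata, d_odata);
//   // d_odata now holds one partial sum per block; reduce again over the
//   // partials (or finish on the CPU) to obtain the final total.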
// Instantiate the reduction function for 3 types
template void
reduce<int>(int size, int threads, int blocks,
int whichKernel, int *d_idata, int *d_odata);
template void
reduce<float>(int size, int threads, int blocks,
int whichKernel, float *d_idata, float *d_odata);
template void
reduce<double>(int size, int threads, int blocks,
int whichKernel, double *d_idata, double *d_odata);
#endif // #ifndef _REDUCE_KERNEL_H_
/**
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#define NUM_ELEMS 10000000
#define NUM_THREADS_PER_BLOCK 512
// warp-aggregated atomic increment
__device__ int atomicAggInc(int *counter) {
cg::coalesced_group active = cg::coalesced_threads();
int mask = active.ballot(1);
// select the leader
int leader = __ffs(mask) - 1;
// leader does the update
int res = 0;
if (active.thread_rank() == leader) {
res = atomicAdd(counter, __popc(mask));
}
// broadcast result
res = active.shfl(res, leader);
// each thread computes its own value
return res + __popc(mask & ((1 << active.thread_rank()) - 1));
}
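// Worked example (illustrative): if three threads of a warp reach
// atomicAggInc() together, cg::coalesced_threads() groups exactly those
// threads with ranks 0..2, so mask = 0b111 and the leader is rank 0. The
// single atomicAdd reserves __popc(mask) = 3 slots and returns a base offset;
// rank r then gets base + __popc(mask & ((1 << r) - 1)) = base + r, i.e. the
// three threads receive consecutive indices from one atomic operation.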
__global__ void filter_arr(int *dst, int *nres, const int *src, int n) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = id; i < n; i += gridDim.x * blockDim.x) {
if (src[i] > 0) dst[atomicAggInc(nres)] = src[i];
}
}
int main(int argc, char **argv) {
int *data_to_filter, *filtered_data, nres = 0;
int *d_data_to_filter, *d_filtered_data, *d_nres;
data_to_filter = reinterpret_cast<int *>(malloc(sizeof(int) * NUM_ELEMS));
// Generate input data.
for (int i = 0; i < NUM_ELEMS; i++) {
data_to_filter[i] = rand() % 20;
}
findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaMalloc(&d_data_to_filter, sizeof(int) * NUM_ELEMS));
checkCudaErrors(cudaMalloc(&d_filtered_data, sizeof(int) * NUM_ELEMS));
checkCudaErrors(cudaMalloc(&d_nres, sizeof(int)));
checkCudaErrors(cudaMemcpy(d_data_to_filter, data_to_filter,
sizeof(int) * NUM_ELEMS, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(d_nres, 0, sizeof(int)));
dim3 dimBlock(NUM_THREADS_PER_BLOCK, 1, 1);
dim3 dimGrid((NUM_ELEMS / NUM_THREADS_PER_BLOCK) + 1, 1, 1);
filter_arr<<<dimGrid, dimBlock>>>(d_filtered_data, d_nres, d_data_to_filter,
NUM_ELEMS);
checkCudaErrors(
cudaMemcpy(&nres, d_nres, sizeof(int), cudaMemcpyDeviceToHost));
filtered_data = reinterpret_cast<int *>(malloc(sizeof(int) * nres));
checkCudaErrors(cudaMemcpy(filtered_data, d_filtered_data, sizeof(int) * nres,
cudaMemcpyDeviceToHost));
int *host_filtered_data =
reinterpret_cast<int *>(malloc(sizeof(int) * NUM_ELEMS));
// Generate host output with host filtering code.
int host_flt_count = 0;
for (int i = 0; i < NUM_ELEMS; i++) {
if (data_to_filter[i] > 0) {
host_filtered_data[host_flt_count++] = data_to_filter[i];
}
}
printf("\nWarp Aggregated Atomics %s \n",
host_flt_count == nres ? "PASSED" : "FAILED");
checkCudaErrors(cudaFree(d_data_to_filter));
checkCudaErrors(cudaFree(d_filtered_data));
checkCudaErrors(cudaFree(d_nres));
free(data_to_filter);
free(filtered_data);
free(host_filtered_data);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef FWT_KERNEL_CUH
#define FWT_KERNEL_CUH
#ifndef fwt_kernel_cuh
#define fwt_kernel_cuh
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
///////////////////////////////////////////////////////////////////////////////
// Elementary(for vectors less than elementary size) in-shared memory
// combined radix-2 + radix-4 Fast Walsh Transform
///////////////////////////////////////////////////////////////////////////////
#define ELEMENTARY_LOG2SIZE 11
__global__ void fwtBatch1Kernel(float *d_Output, float *d_Input, int log2N)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
const int N = 1 << log2N;
const int base = blockIdx.x << log2N;
//(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80
extern __shared__ float s_data[];
float *d_Src = d_Input + base;
float *d_Dst = d_Output + base;
for (int pos = threadIdx.x; pos < N; pos += blockDim.x)
{
s_data[pos] = d_Src[pos];
}
//Main radix-4 stages
const int pos = threadIdx.x;
for (int stride = N >> 2; stride > 0; stride >>= 2)
{
int lo = pos & (stride - 1);
int i0 = ((pos - lo) << 2) + lo;
int i1 = i0 + stride;
int i2 = i1 + stride;
int i3 = i2 + stride;
cg::sync(cta);
float D0 = s_data[i0];
float D1 = s_data[i1];
float D2 = s_data[i2];
float D3 = s_data[i3];
float T;
T = D0;
D0 = D0 + D2;
D2 = T - D2;
T = D1;
D1 = D1 + D3;
D3 = T - D3;
T = D0;
s_data[i0] = D0 + D1;
s_data[i1] = T - D1;
T = D2;
s_data[i2] = D2 + D3;
s_data[i3] = T - D3;
}
//Do single radix-2 stage for odd power of two
if (log2N & 1)
{
cg::sync(cta);
for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x)
{
int i0 = pos << 1;
int i1 = i0 + 1;
float D0 = s_data[i0];
float D1 = s_data[i1];
s_data[i0] = D0 + D1;
s_data[i1] = D0 - D1;
}
}
cg::sync(cta);
for (int pos = threadIdx.x; pos < N; pos += blockDim.x)
{
d_Dst[pos] = s_data[pos];
}
}
////////////////////////////////////////////////////////////////////////////////
// Single in-global memory radix-4 Fast Walsh Transform pass
// (for strides exceeding elementary vector size)
////////////////////////////////////////////////////////////////////////////////
__global__ void fwtBatch2Kernel(
float *d_Output,
float *d_Input,
int stride
)
{
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
const int N = blockDim.x * gridDim.x * 4;
float *d_Src = d_Input + blockIdx.y * N;
float *d_Dst = d_Output + blockIdx.y * N;
int lo = pos & (stride - 1);
int i0 = ((pos - lo) << 2) + lo;
int i1 = i0 + stride;
int i2 = i1 + stride;
int i3 = i2 + stride;
float D0 = d_Src[i0];
float D1 = d_Src[i1];
float D2 = d_Src[i2];
float D3 = d_Src[i3];
float T;
T = D0;
D0 = D0 + D2;
D2 = T - D2;
T = D1;
D1 = D1 + D3;
D3 = T - D3;
T = D0;
d_Dst[i0] = D0 + D1;
d_Dst[i1] = T - D1;
T = D2;
d_Dst[i2] = D2 + D3;
d_Dst[i3] = T - D3;
}
////////////////////////////////////////////////////////////////////////////////
// Put everything together: batched Fast Walsh Transform CPU front-end
////////////////////////////////////////////////////////////////////////////////
void fwtBatchGPU(float *d_Data, int M, int log2N)
{
const int THREAD_N = 256;
int N = 1 << log2N;
dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1);
for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2)
{
fwtBatch2Kernel<<<grid, THREAD_N>>>(d_Data, d_Data, N / 4);
getLastCudaError("fwtBatch2Kernel() execution failed\n");
}
fwtBatch1Kernel<<<M, N / 4, N *sizeof(float)>>>(
d_Data,
d_Data,
log2N
);
getLastCudaError("fwtBatch1Kernel() execution failed\n");
}
////////////////////////////////////////////////////////////////////////////////
// Modulate two arrays
////////////////////////////////////////////////////////////////////////////////
__global__ void modulateKernel(float *d_A, float *d_B, int N)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int numThreads = blockDim.x * gridDim.x;
float rcpN = 1.0f / (float)N;
for (int pos = tid; pos < N; pos += numThreads)
{
d_A[pos] *= d_B[pos] * rcpN;
}
}
//Interface to modulateKernel()
void modulateGPU(float *d_A, float *d_B, int N)
{
modulateKernel<<<128, 256>>>(d_A, d_B, N);
}
#endif
#endif
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Walsh transforms belong to a class of generalized Fourier transformations.
* They have applications in various fields of electrical engineering
* and numeric theory. In this sample we demonstrate efficient implementation
* of naturally-ordered Walsh transform
* (also known as Walsh-Hadamard or Hadamard transform) in CUDA and its
* particular application to dyadic convolution computation.
 * Refer to Jorg Arndt's excellent "Algorithms for Programmers" textbook
* http://www.jjj.de/fxt/fxtbook.pdf (Chapter 22)
*
* Victor Podlozhnyuk (vpodlozhnyuk@nvidia.com)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <helper_functions.h>
#include <helper_cuda.h>
////////////////////////////////////////////////////////////////////////////////
// Reference CPU FWT
////////////////////////////////////////////////////////////////////////////////
extern"C" void fwtCPU(float *h_Output, float *h_Input, int log2N);
extern"C" void slowWTcpu(float *h_Output, float *h_Input, int log2N);
extern "C" void dyadicConvolutionCPU(
float *h_Result,
float *h_Data,
float *h_Kernel,
int log2dataN,
int log2kernelN
);
////////////////////////////////////////////////////////////////////////////////
// GPU FWT
////////////////////////////////////////////////////////////////////////////////
#include "fastWalshTransform_kernel.cuh"
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int log2Kernel = 7;
const int log2Data = 23;
const int dataN = 1 << log2Data;
const int kernelN = 1 << log2Kernel;
const int DATA_SIZE = dataN * sizeof(float);
const int KERNEL_SIZE = kernelN * sizeof(float);
const double NOPS = 3.0 * (double)dataN * (double)log2Data / 2.0;
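// The GPU pipeline below relies on the convolution property of the
// Walsh-Hadamard transform: for dyadic (XOR) convolution,
// WHT(a conv b) = WHT(a) .* WHT(b), hence a conv b = (1/N) * WHT(WHT(a) .* WHT(b)).
// The three fwtBatchGPU() calls plus modulateGPU() in main() implement exactly
// this; the 1/N factor is folded into modulateKernel() via rcpN.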
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
float *h_Data,
*h_Kernel,
*h_ResultCPU,
*h_ResultGPU;
float *d_Data,
*d_Kernel;
double delta, ref, sum_delta2, sum_ref2, L2norm, gpuTime;
StopWatchInterface *hTimer = NULL;
int i;
printf("%s Starting...\n\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
findCudaDevice(argc, (const char **)argv);
sdkCreateTimer(&hTimer);
printf("Initializing data...\n");
printf("...allocating CPU memory\n");
h_Kernel = (float *)malloc(KERNEL_SIZE);
h_Data = (float *)malloc(DATA_SIZE);
h_ResultCPU = (float *)malloc(DATA_SIZE);
h_ResultGPU = (float *)malloc(DATA_SIZE);
printf("...allocating GPU memory\n");
checkCudaErrors(cudaMalloc((void **)&d_Kernel, DATA_SIZE));
checkCudaErrors(cudaMalloc((void **)&d_Data, DATA_SIZE));
printf("...generating data\n");
printf("Data length: %i; kernel length: %i\n", dataN, kernelN);
srand(2007);
for (i = 0; i < kernelN; i++)
{
h_Kernel[i] = (float)rand() / (float)RAND_MAX;
}
for (i = 0; i < dataN; i++)
{
h_Data[i] = (float)rand() / (float)RAND_MAX;
}
checkCudaErrors(cudaMemset(d_Kernel, 0, DATA_SIZE));
checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice));
printf("Running GPU dyadic convolution using Fast Walsh Transform...\n");
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
fwtBatchGPU(d_Data, 1, log2Data);
fwtBatchGPU(d_Kernel, 1, log2Data);
modulateGPU(d_Data, d_Kernel, dataN);
fwtBatchGPU(d_Data, 1, log2Data);
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
gpuTime = sdkGetTimerValue(&hTimer);
printf("GPU time: %f ms; GOP/s: %f\n", gpuTime, NOPS / (gpuTime * 0.001 * 1E+9));
printf("Reading back GPU results...\n");
checkCudaErrors(cudaMemcpy(h_ResultGPU, d_Data, DATA_SIZE, cudaMemcpyDeviceToHost));
printf("Running straightforward CPU dyadic convolution...\n");
dyadicConvolutionCPU(h_ResultCPU, h_Data, h_Kernel, log2Data, log2Kernel);
printf("Comparing the results...\n");
sum_delta2 = 0;
sum_ref2 = 0;
for (i = 0; i < dataN; i++)
{
delta = h_ResultCPU[i] - h_ResultGPU[i];
ref = h_ResultCPU[i];
sum_delta2 += delta * delta;
sum_ref2 += ref * ref;
}
L2norm = sqrt(sum_delta2 / sum_ref2);
printf("Shutting down...\n");
sdkDeleteTimer(&hTimer);
checkCudaErrors(cudaFree(d_Data));
checkCudaErrors(cudaFree(d_Kernel));
free(h_ResultGPU);
free(h_ResultCPU);
free(h_Data);
free(h_Kernel);
printf("L2 norm: %E\n", L2norm);
printf(L2norm < 1e-6 ? "Test passed\n" : "Test failed!\n");
}
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample implements a conjugate gradient solver on GPU using
* Multi Block Cooperative Groups, also uses Unified Memory.
*
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
// Utilities and system includes
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <helper_cuda.h> // helper function CUDA error checking and initialization
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
const char *sSDKname = "conjugateGradientMultiBlockCG";
#define ENABLE_CPU_DEBUG_CODE 0
#define THREADS_PER_BLOCK 512
/* genTridiag: generate a random tridiagonal symmetric matrix */
void genTridiag(int *I, int *J, float *val, int N, int nz)
{
I[0] = 0, J[0] = 0, J[1] = 1;
val[0] = (float)rand()/RAND_MAX + 10.0f;
val[1] = (float)rand()/RAND_MAX;
int start;
for (int i = 1; i < N; i++)
{
if (i > 1)
{
I[i] = I[i-1]+3;
}
else
{
I[1] = 2;
}
start = (i-1)*3 + 2;
J[start] = i - 1;
J[start+1] = i;
if (i < N-1)
{
J[start+2] = i + 1;
}
val[start] = val[start-1];
val[start+1] = (float)rand()/RAND_MAX + 10.0f;
if (i < N-1)
{
val[start+2] = (float)rand()/RAND_MAX;
}
}
I[N] = nz;
}
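// Worked example (illustrative): for N = 4 the generator above produces
// nz = (N - 2) * 3 + 4 = 10 non-zeros laid out in CSR as
//   I   = { 0, 2, 5, 8, 10 }                    (row pointers)
//   J   = { 0, 1,  0, 1, 2,  1, 2, 3,  2, 3 }   (column indices)
// i.e. the first and last rows hold two entries each and every interior row
// holds three, giving a symmetric tridiagonal pattern.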
// I - CSR row pointers: I[i] is the index into J/val where row i begins (N + 1 entries)
// J - column index of each non-zero element of the matrix
// val - value of each non-zero element of the matrix
// inputVecX - input vector to be multiplied
// outputVecY - resultant vector
void cpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, float alpha, float *inputVecX, float *outputVecY)
{
for (int i=0; i < num_rows; i++)
{
int num_elems_this_row = I[i+1] - I[i];
float output = 0.0;
for (int j=0; j < num_elems_this_row; j++)
{
output += alpha*val[I[i] + j] * inputVecX[J[I[i] + j]];
}
outputVecY[i] = output;
}
return;
}
double dotProduct(float *vecA, float *vecB, int size)
{
double result = 0.0;
for (int i=0; i < size; i++)
{
result = result + (vecA[i] * vecB[i]);
}
return result;
}
void scaleVector(float *vec, float alpha, int size)
{
for (int i=0; i < size; i++)
{
vec[i] = alpha*vec[i];
}
}
void saxpy(float *x, float *y, float a, int size)
{
for (int i=0; i < size; i++)
{
y[i] = a*x[i] + y[i];
}
}
void cpuConjugateGrad(int *I, int *J, float *val, float *x, float *Ax, float *p, float *r, int nnz, int N, float tol)
{
int max_iter = 10000;
float alpha = 1.0;
float alpham1 = -1.0;
float r0 = 0.0, b, a, na;
cpuSpMV(I, J, val, nnz, N, alpha, x, Ax);
saxpy(Ax, r, alpham1, N);
float r1 = dotProduct(r, r, N);
int k = 1;
while (r1 > tol*tol && k <= max_iter)
{
if (k > 1)
{
b = r1 / r0;
scaleVector(p, b, N);
saxpy(r, p, alpha, N);
}
else
{
for(int i=0; i < N; i++)
p[i] = r[i];
}
cpuSpMV(I, J, val, nnz, N, alpha, p, Ax);
float dot = dotProduct(p, Ax, N);
a = r1 / dot;
saxpy(p, x, a, N);
na = -a;
saxpy(Ax, r, na, N);
r0 = r1;
r1 = dotProduct(r, r, N);
printf("\nCPU code iteration = %3d, residual = %e\n", k, sqrt(r1));
k++;
}
}
__device__ void gpuSpMV(int *I, int *J, float *val, int nnz, int num_rows, float alpha, float *inputVecX,
float *outputVecY, cg::thread_block &cta, const cg::grid_group &grid)
{
for (int i=grid.thread_rank(); i < num_rows; i+= grid.size())
{
int row_elem = I[i];
int next_row_elem = I[i+1];
int num_elems_this_row = next_row_elem - row_elem;
float output = 0.0;
for (int j=0; j < num_elems_this_row; j++)
{
// The I, J and val arrays could be staged in shared memory, since they are
// accessed irregularly and reused across successive calls to gpuSpMV.
output += alpha*val[row_elem + j] * inputVecX[J[row_elem + j]];
}
outputVecY[i] = output;
}
}
__device__ void gpuSaxpy(float *x, float *y, float a, int size, const cg::grid_group &grid)
{
for (int i=grid.thread_rank(); i < size; i+= grid.size())
{
y[i] = a*x[i] + y[i];
}
}
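// Grid-wide dot product: each thread accumulates a grid-strided partial sum,
// each block reduces its partials in shared memory using 32-thread tiles, and
// thread 0 of every block atomically adds the block total into *result.
// The caller zeroes *result and performs a grid-wide sync before reading it.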
__device__ void gpuDotProduct(float *vecA, float *vecB, double *result, int size, const cg::thread_block &cta, const cg::grid_group &grid)
{
__shared__ double tmp[THREADS_PER_BLOCK];
double temp_sum = 0.0;
for (int i=grid.thread_rank(); i < size; i+=grid.size())
{
temp_sum += (double) (vecA[i] * vecB[i]);
}
tmp[cta.thread_rank()] = temp_sum;
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
double beta = temp_sum;
double temp;
for (int i = tile32.size() / 2; i > 0; i >>= 1) {
if (tile32.thread_rank() < i) {
temp = tmp[cta.thread_rank() + i];
beta += temp;
tmp[cta.thread_rank()] = beta;
}
cg::sync(tile32);
}
cg::sync(cta);
if (cta.thread_rank() == 0) {
beta = 0.0;
for (int i = 0; i < cta.size(); i += tile32.size()) {
beta += tmp[i];
}
atomicAdd(result, beta);
}
}
__device__ void gpuCopyVector(float *srcA, float *destB, int size, const cg::grid_group &grid)
{
for (int i=grid.thread_rank(); i < size; i+= grid.size())
{
destB[i] = srcA[i];
}
}
__device__ void gpuScaleVector(float *vec, float alpha, int size, const cg::grid_group &grid)
{
for (int i=grid.thread_rank(); i < size; i+= grid.size())
{
vec[i] = alpha*vec[i];
}
}
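// Whole-solver kernel: the CG iteration runs entirely on the device, using
// cg::sync(grid) as a grid-wide barrier between the SpMV, dot-product and
// saxpy phases. This requires launching the kernel with
// cudaLaunchCooperativeKernel (see main() below) so that all blocks of the
// grid are resident on the GPU at the same time.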
extern "C" __global__ void gpuConjugateGradient(int *I, int *J, float *val, float *x, float *Ax, float *p, float *r, double *dot_result, int nnz, int N, float tol)
{
cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
int max_iter = 10000;
float alpha = 1.0;
float alpham1 = -1.0;
float r0 = 0.0, r1, b, a, na;
gpuSpMV(I, J, val, nnz, N, alpha, x, Ax, cta, grid);
cg::sync(grid);
gpuSaxpy(Ax, r, alpham1, N, grid);
cg::sync(grid);
gpuDotProduct(r, r, dot_result, N, cta, grid);
cg::sync(grid);
r1 = *dot_result;
int k = 1;
while (r1 > tol*tol && k <= max_iter)
{
if (k > 1)
{
b = r1 / r0;
gpuScaleVector(p, b, N, grid);
cg::sync(grid);
gpuSaxpy(r, p, alpha, N, grid);
}
else
{
gpuCopyVector(r, p, N, grid);
}
cg::sync(grid);
gpuSpMV(I, J, val, nnz, N, alpha, p, Ax, cta, grid);
if (threadIdx.x == 0 && blockIdx.x == 0)
*dot_result = 0.0;
cg::sync(grid);
gpuDotProduct(p, Ax, dot_result, N, cta, grid);
cg::sync(grid);
a = r1 / *dot_result;
gpuSaxpy(p, x, a, N, grid);
na = -a;
gpuSaxpy(Ax, r, na, N, grid);
r0 = r1;
if (threadIdx.x == 0 && blockIdx.x == 0)
*dot_result = 0.0;
cg::sync(grid);
gpuDotProduct(r, r, dot_result, N, cta, grid);
cg::sync(grid);
r1 = *dot_result;
k++;
}
}
bool areAlmostEqual(float a, float b, float maxRelDiff)
{
float diff = fabsf(a - b);
float abs_a = fabsf(a);
float abs_b = fabsf(b);
float largest = abs_a > abs_b? abs_a : abs_b;
if(diff <= largest * maxRelDiff)
{
return true;
}
else
{
printf("maxRelDiff = %.8e\n", maxRelDiff);
printf("diff %.8e > largest * maxRelDiff %.8e therefore %.8e and %.8e are not same\n", diff, largest * maxRelDiff, a, b);
return false;
}
}
int main(int argc, char **argv)
{
int N = 0, nz = 0, *I = NULL, *J = NULL;
float *val = NULL;
const float tol = 1e-5f;
float *x;
float *rhs;
float r1;
float *r, *p, *Ax;
cudaEvent_t start, stop;
printf("Starting [%s]...\n", sSDKname);
// This will pick the best possible CUDA capable device
cudaDeviceProp deviceProp;
int devID = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
if (!deviceProp.managedMemory) {
// This sample requires being run on a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n");
exit(EXIT_WAIVED);
}
// This sample requires being run on a device that supports Cooperative Kernel Launch
if (!deviceProp.cooperativeLaunch)
{
printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run\n", devID);
exit(EXIT_WAIVED);
}
// Statistics about the GPU device
printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
/* Generate a random tridiagonal symmetric matrix in CSR format */
N = 1048576;
nz = (N-2)*3 + 4;
cudaMallocManaged((void **)&I, sizeof(int)*(N+1));
cudaMallocManaged((void **)&J, sizeof(int)*nz);
cudaMallocManaged((void **)&val, sizeof(float)*nz);
genTridiag(I, J, val, N, nz);
cudaMallocManaged((void **)&x, sizeof(float)*N);
cudaMallocManaged((void **)&rhs, sizeof(float)*N);
double *dot_result;
cudaMallocManaged((void **)&dot_result, sizeof(double));
*dot_result = 0.0;
// temp memory for CG
checkCudaErrors(cudaMallocManaged((void **)&r, N*sizeof(float)));
checkCudaErrors(cudaMallocManaged((void **)&p, N*sizeof(float)));
checkCudaErrors(cudaMallocManaged((void **)&Ax, N*sizeof(float)));
cudaDeviceSynchronize();
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
#if ENABLE_CPU_DEBUG_CODE
float *Ax_cpu = (float *) malloc(sizeof(float)*N);
float *r_cpu = (float *) malloc(sizeof(float)*N);
float *p_cpu = (float *) malloc(sizeof(float)*N);
float *x_cpu = (float *) malloc(sizeof(float)*N);
for (int i=0; i < N; i++)
{
r_cpu[i] = 1.0;
Ax_cpu[i] = x_cpu[i] = 0.0;
}
#endif
for (int i = 0; i < N; i++)
{
r[i] = rhs[i] = 1.0;
x[i] = 0.0;
}
void *kernelArgs[] = {
(void*)&I,
(void*)&J,
(void*)&val,
(void*)&x,
(void*)&Ax,
(void*)&p,
(void*)&r,
(void*)&dot_result,
(void*)&nz,
(void*)&N,
(void*)&tol,
};
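// A cooperative launch requires all blocks of the grid to be co-resident on
// the device, so the grid is sized from the occupancy query below:
// dimGrid.x = numBlocksPerSm * number of SMs.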
int sMemSize = sizeof(double) * THREADS_PER_BLOCK;
int numBlocksPerSm = 0;
int numThreads = THREADS_PER_BLOCK;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, gpuConjugateGradient, numThreads, sMemSize));
int numSms = deviceProp.multiProcessorCount;
dim3 dimGrid(numSms*numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1);
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaLaunchCooperativeKernel((void *)gpuConjugateGradient, dimGrid, dimBlock, kernelArgs, sMemSize, NULL));
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
float time;
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
r1 = *dot_result;
printf("GPU Final, residual = %e, kernel execution time = %f ms\n", sqrt(r1), time);
#if ENABLE_CPU_DEBUG_CODE
cpuConjugateGrad(I, J, val, x_cpu, Ax_cpu, p_cpu, r_cpu, nz, N, tol);
#endif
float rsum, diff, err = 0.0;
for (int i = 0; i < N; i++)
{
rsum = 0.0;
for (int j = I[i]; j < I[i+1]; j++)
{
rsum += val[j]*x[J[j]];
}
diff = fabs(rsum - rhs[i]);
if (diff > err)
{
err = diff;
}
}
checkCudaErrors(cudaFree(I));
checkCudaErrors(cudaFree(J));
checkCudaErrors(cudaFree(val));
checkCudaErrors(cudaFree(x));
checkCudaErrors(cudaFree(rhs));
checkCudaErrors(cudaFree(r));
checkCudaErrors(cudaFree(p));
checkCudaErrors(cudaFree(Ax));
checkCudaErrors(cudaFree(dot_result));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
#if ENABLE_CPU_DEBUG_CODE
free(Ax_cpu);
free(r_cpu);
free(p_cpu);
free(x_cpu);
#endif
printf("Test Summary: Error amount = %f \n", err);
fprintf(stdout,"&&&& conjugateGradientMultiBlockCG %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED");
exit((sqrt(r1) < tol) ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "FDTD3dGPU.h"
#include <iostream>
#include <algorithm>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "FDTD3dGPUKernel.cuh"
bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv)
{
int deviceCount = 0;
int targetDevice = 0;
size_t memsize = 0;
// Get the number of CUDA enabled GPU devices
printf(" cudaGetDeviceCount\n");
checkCudaErrors(cudaGetDeviceCount(&deviceCount));
// Select target device (device 0 by default)
targetDevice = findCudaDevice(argc, (const char **)argv);
// Query target device for maximum memory allocation
printf(" cudaGetDeviceProperties\n");
struct cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, targetDevice));
memsize = deviceProp.totalGlobalMem;
// Save the result
*result = (memsize_t)memsize;
return true;
}
bool fdtdGPU(float *output, const float *input, const float *coeff, const int dimx, const int dimy, const int dimz, const int radius, const int timesteps, const int argc, const char **argv)
{
const int outerDimx = dimx + 2 * radius;
const int outerDimy = dimy + 2 * radius;
const int outerDimz = dimz + 2 * radius;
const size_t volumeSize = outerDimx * outerDimy * outerDimz;
int deviceCount = 0;
int targetDevice = 0;
float *bufferOut = 0;
float *bufferIn = 0;
dim3 dimBlock;
dim3 dimGrid;
// Ensure that the inner data starts on a 128B boundary
const int padding = (128 / sizeof(float)) - radius;
const size_t paddedVolumeSize = volumeSize + padding;
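// Worked example: with 4-byte floats and radius = 4, padding = 128/4 - 4 = 28
// elements, so the first interior element of the volume (at offset
// padding + radius = 32 floats = 128 bytes) starts on a 128-byte boundary.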
#ifdef GPU_PROFILING
cudaEvent_t profileStart = 0;
cudaEvent_t profileEnd = 0;
const int profileTimesteps = timesteps - 1;
if (profileTimesteps < 1)
{
printf(" cannot profile with fewer than two timesteps (timesteps=%d), profiling is disabled.\n", timesteps);
}
#endif
// Check the radius is valid
if (radius != RADIUS)
{
printf("radius is invalid, must be %d - see kernel for details.\n", RADIUS);
exit(EXIT_FAILURE);
}
// Get the number of CUDA enabled GPU devices
checkCudaErrors(cudaGetDeviceCount(&deviceCount));
// Select target device (device 0 by default)
targetDevice = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaSetDevice(targetDevice));
// Allocate memory buffers
checkCudaErrors(cudaMalloc((void **)&bufferOut, paddedVolumeSize * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&bufferIn, paddedVolumeSize * sizeof(float)));
// Check for a command-line specified block size
int userBlockSize;
if (checkCmdLineFlag(argc, (const char **)argv, "block-size"))
{
userBlockSize = getCmdLineArgumentInt(argc, argv, "block-size");
// Constrain to a multiple of k_blockDimX
userBlockSize = (userBlockSize / k_blockDimX * k_blockDimX);
// Constrain within allowed bounds
userBlockSize = MIN(MAX(userBlockSize, k_blockSizeMin), k_blockSizeMax);
}
else
{
userBlockSize = k_blockSizeMax;
}
// Check the device limit on the number of threads
struct cudaFuncAttributes funcAttrib;
checkCudaErrors(cudaFuncGetAttributes(&funcAttrib, FiniteDifferencesKernel));
userBlockSize = MIN(userBlockSize, funcAttrib.maxThreadsPerBlock);
// Set the block size
dimBlock.x = k_blockDimX;
// Visual Studio 2005 does not like std::min
// dimBlock.y = std::min<size_t>(userBlockSize / k_blockDimX, (size_t)k_blockDimMaxY);
dimBlock.y = ((userBlockSize / k_blockDimX) < (size_t)k_blockDimMaxY) ? (userBlockSize / k_blockDimX) : (size_t)k_blockDimMaxY;
dimGrid.x = (unsigned int)ceil((float)dimx / dimBlock.x);
dimGrid.y = (unsigned int)ceil((float)dimy / dimBlock.y);
printf(" set block size to %dx%d\n", dimBlock.x, dimBlock.y);
printf(" set grid size to %dx%d\n", dimGrid.x, dimGrid.y);
// Check the block size is valid
if (dimBlock.x < RADIUS || dimBlock.y < RADIUS)
{
printf("invalid block size, x (%d) and y (%d) must be >= radius (%d).\n", dimBlock.x, dimBlock.y, RADIUS);
exit(EXIT_FAILURE);
}
// Copy the input to the device input buffer
checkCudaErrors(cudaMemcpy(bufferIn + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));
// Copy the input to the device output buffer (actually only need the halo)
checkCudaErrors(cudaMemcpy(bufferOut + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));
// Copy the coefficients to the device coefficient buffer
checkCudaErrors(cudaMemcpyToSymbol(stencil, (void *)coeff, (radius + 1) * sizeof(float)));
#ifdef GPU_PROFILING
// Create the events
checkCudaErrors(cudaEventCreate(&profileStart));
checkCudaErrors(cudaEventCreate(&profileEnd));
#endif
// Execute the FDTD
float *bufferSrc = bufferIn + padding;
float *bufferDst = bufferOut + padding;
printf(" GPU FDTD loop\n");
#ifdef GPU_PROFILING
// Enqueue start event
checkCudaErrors(cudaEventRecord(profileStart, 0));
#endif
for (int it = 0 ; it < timesteps ; it++)
{
printf("\tt = %d ", it);
// Launch the kernel
printf("launch kernel\n");
FiniteDifferencesKernel<<<dimGrid, dimBlock>>>(bufferDst, bufferSrc, dimx, dimy, dimz);
// Toggle the buffers
// Visual Studio 2005 does not like std::swap
// std::swap<float *>(bufferSrc, bufferDst);
float *tmp = bufferDst;
bufferDst = bufferSrc;
bufferSrc = tmp;
}
printf("\n");
#ifdef GPU_PROFILING
// Enqueue end event
checkCudaErrors(cudaEventRecord(profileEnd, 0));
#endif
// Wait for the kernel to complete
checkCudaErrors(cudaDeviceSynchronize());
// Read the result back, result is in bufferSrc (after final toggle)
checkCudaErrors(cudaMemcpy(output, bufferSrc, volumeSize * sizeof(float), cudaMemcpyDeviceToHost));
// Report time
#ifdef GPU_PROFILING
float elapsedTimeMS = 0;
if (profileTimesteps > 0)
{
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeMS, profileStart, profileEnd));
}
if (profileTimesteps > 0)
{
// Convert milliseconds to seconds
double elapsedTime = elapsedTimeMS * 1.0e-3;
double avgElapsedTime = elapsedTime / (double)profileTimesteps;
// Determine number of computations per timestep
size_t pointsComputed = dimx * dimy * dimz;
// Determine throughput
double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime;
printf("FDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, NumDevsUsed = %u, Blocksize = %u\n",
throughputM, avgElapsedTime, pointsComputed, 1, dimBlock.x * dimBlock.y);
}
#endif
// Cleanup
if (bufferIn)
{
checkCudaErrors(cudaFree(bufferIn));
}
if (bufferOut)
{
checkCudaErrors(cudaFree(bufferOut));
}
#ifdef GPU_PROFILING
if (profileStart)
{
checkCudaErrors(cudaEventDestroy(profileStart));
}
if (profileEnd)
{
checkCudaErrors(cudaEventDestroy(profileEnd));
}
#endif
return true;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "FDTD3dGPU.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Note: If you change the RADIUS, you should also change the unrolling below
#define RADIUS 4
__constant__ float stencil[RADIUS + 1];
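// 3D finite-difference stencil: each thread owns one (x, y) column and marches
// along z, keeping its z-neighbourhood in registers (the "infront"/"behind"
// queues) while the current xy-plane, plus a halo of RADIUS cells, is staged
// in the shared-memory tile.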
__global__ void FiniteDifferencesKernel(float *output,
const float *input,
const int dimx,
const int dimy,
const int dimz)
{
bool validr = true;
bool validw = true;
const int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
const int gtidy = blockIdx.y * blockDim.y + threadIdx.y;
const int ltidx = threadIdx.x;
const int ltidy = threadIdx.y;
const int workx = blockDim.x;
const int worky = blockDim.y;
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ float tile[k_blockDimMaxY + 2 * RADIUS][k_blockDimX + 2 * RADIUS];
const int stride_y = dimx + 2 * RADIUS;
const int stride_z = stride_y * (dimy + 2 * RADIUS);
int inputIndex = 0;
int outputIndex = 0;
// Advance inputIndex to start of inner volume
inputIndex += RADIUS * stride_y + RADIUS;
// Advance inputIndex to target element
inputIndex += gtidy * stride_y + gtidx;
float infront[RADIUS];
float behind[RADIUS];
float current;
const int tx = ltidx + RADIUS;
const int ty = ltidy + RADIUS;
// Check in bounds
if ((gtidx >= dimx + RADIUS) || (gtidy >= dimy + RADIUS))
validr = false;
if ((gtidx >= dimx) || (gtidy >= dimy))
validw = false;
// Preload the "infront" and "behind" data
for (int i = RADIUS - 2 ; i >= 0 ; i--)
{
if (validr)
behind[i] = input[inputIndex];
inputIndex += stride_z;
}
if (validr)
current = input[inputIndex];
outputIndex = inputIndex;
inputIndex += stride_z;
for (int i = 0 ; i < RADIUS ; i++)
{
if (validr)
infront[i] = input[inputIndex];
inputIndex += stride_z;
}
// Step through the xy-planes
#pragma unroll 9
for (int iz = 0 ; iz < dimz ; iz++)
{
// Advance the slice (move the thread-front)
for (int i = RADIUS - 1 ; i > 0 ; i--)
behind[i] = behind[i - 1];
behind[0] = current;
current = infront[0];
#pragma unroll 4
for (int i = 0 ; i < RADIUS - 1 ; i++)
infront[i] = infront[i + 1];
if (validr)
infront[RADIUS - 1] = input[inputIndex];
inputIndex += stride_z;
outputIndex += stride_z;
cg::sync(cta);
// Note that for the work items on the boundary of the problem, the
// supplied index when reading the halo (below) may wrap to the
// previous/next row or even the previous/next xy-plane. This is
// acceptable since a) we disable the output write for these work
// items and b) there is at least one xy-plane before/after the
// current plane, so the access will be within bounds.
// Update the data slice in the local tile
// Halo above & below
if (ltidy < RADIUS)
{
tile[ltidy][tx] = input[outputIndex - RADIUS * stride_y];
tile[ltidy + worky + RADIUS][tx] = input[outputIndex + worky * stride_y];
}
// Halo left & right
if (ltidx < RADIUS)
{
tile[ty][ltidx] = input[outputIndex - RADIUS];
tile[ty][ltidx + workx + RADIUS] = input[outputIndex + workx];
}
tile[ty][tx] = current;
cg::sync(cta);
// Compute the output value
float value = stencil[0] * current;
#pragma unroll 4
for (int i = 1 ; i <= RADIUS ; i++)
{
value += stencil[i] * (infront[i-1] + behind[i-1] + tile[ty - i][tx] + tile[ty + i][tx] + tile[ty][tx - i] + tile[ty][tx + i]);
}
// Store the output value
if (validw)
output[outputIndex] = value;
}
}
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
Parallel reduction
This sample shows how to perform a reduction operation on an array of values
to produce a single value in a single kernel (as opposed to two or more
kernel calls as shown in the "reduction" CUDA Sample). Single-pass
reduction requires Cooperative Groups.
Reductions are a very common computation in parallel algorithms. Any time
an array of values needs to be reduced to a single value using a binary
associative operator, a reduction can be used. Example applications include
statistics computations such as mean and standard deviation, and image
processing applications such as finding the total luminance of an
image.
This code performs sum reductions, but any associative operator such as
min() or max() could also be used.
It assumes the input size is a power of 2.
COMMAND LINE ARGUMENTS
"--n=<N>": Specify the number of elements to reduce (default 33554432)
"--threads=<N>": Specify the number of threads per block (default 128)
"--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <helper_functions.h>
#include <helper_cuda.h>
#include <cuda_runtime.h>
const char *sSDKsample = "reductionMultiBlockCG";
#include <cuda_runtime_api.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
/*
Parallel sum reduction using shared memory
- takes log(n) steps for n input elements
- uses n/2 threads
- only works for power-of-2 arrays
This version adds multiple elements per thread sequentially. This reduces the overall
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
(Brent's Theorem optimization)
See the CUDA SDK "reduction" sample for more information.
*/
__device__ void reduceBlock(double *sdata, const cg::thread_block &cta)
{
const unsigned int tid = cta.thread_rank();
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
double beta = sdata[tid];
double temp;
for (int i = tile32.size() / 2; i > 0; i >>= 1) {
if (tile32.thread_rank() < i) {
temp = sdata[tid+i];
beta += temp;
sdata[tid] = beta;
}
cg::sync(tile32);
}
cg::sync(cta);
if (cta.thread_rank() == 0) {
beta = 0;
for (int i = 0; i < blockDim.x; i += tile32.size()) {
beta += sdata[i];
}
sdata[0] = beta;
}
cg::sync(cta);
}
// This reduction kernel reduces an arbitrary size array in a single kernel invocation
//
// For more details on the reduction algorithm (notably the multi-pass approach), see
// the "reduction" sample in the CUDA SDK.
extern "C" __global__ void reduceSinglePassMultiBlockCG(const float *g_idata, float *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block block = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
extern double __shared__ sdata[];
// Stride over grid and add the values to a shared memory buffer
sdata[block.thread_rank()] = 0;
for (int i = grid.thread_rank(); i < n; i += grid.size()) {
sdata[block.thread_rank()] += g_idata[i];
}
cg::sync(block);
// Reduce each block (called once per block)
reduceBlock(sdata, block);
// Write out the result to global memory
if (block.thread_rank() == 0) {
g_odata[blockIdx.x] = sdata[0];
}
cg::sync(grid);
if (grid.thread_rank() == 0) {
for (int block = 1; block < gridDim.x; block++) {
g_odata[0] += g_odata[block];
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
void call_reduceSinglePassMultiBlockCG(int size, int threads, int numBlocks, float *d_idata, float *d_odata)
{
int smemSize = threads * sizeof(double);
void *kernelArgs[] = {
(void*)&d_idata,
(void*)&d_odata,
(void*)&size,
};
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(numBlocks, 1, 1);
cudaLaunchCooperativeKernel((void*)reduceSinglePassMultiBlockCG, dimGrid, dimBlock, kernelArgs, smemSize, NULL);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
}
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
bool runTest(int argc, char **argv, int device);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
cudaDeviceProp deviceProp = { 0 };
int dev;
printf("%s Starting...\n\n", sSDKsample);
dev = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
if (!deviceProp.cooperativeLaunch)
{
printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run\n", dev);
exit(EXIT_WAIVED);
}
bool bTestPassed = false;
bTestPassed = runTest(argc, argv, dev);
exit(bTestPassed ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data pointer to input data
//! @param size number of input data elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
T sum = data[0];
T c = (T)0.0;
for (int i = 1; i < size; i++)
{
T y = data[i] - c;
T t = sum + y;
c = (t - sum) - y;
sum = t;
}
return sum;
}
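// Round up to the next power of two, e.g. nextPow2(33) == 64 and nextPow2(64) == 64.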
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the reduction
// We set threads / block to the minimum of maxThreads and n/2.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
if (n == 1)
{
threads = 1;
blocks = 1;
}
else
{
threads = (n < maxThreads*2) ? nextPow2(n / 2) : maxThreads;
blocks = max(1, n / (threads * 2));
}
blocks = min(maxBlocks, blocks);
}
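// Worked example (illustrative): for n = 1 << 25 with maxThreads = 1024 and
// maxBlocks = 64, this gives threads = 1024 and
// blocks = min(64, n / (threads * 2)) = 64.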
////////////////////////////////////////////////////////////////////////////////
// This function performs a reduction of the input data multiple times and
// measures the average reduction time.
////////////////////////////////////////////////////////////////////////////////
float benchmarkReduce(int n,
int numThreads,
int numBlocks,
int maxThreads,
int maxBlocks,
int testIterations,
StopWatchInterface *timer,
float *h_odata,
float *d_idata,
float *d_odata)
{
float gpu_result = 0;
cudaError_t error;
printf("\nLaunching %s kernel\n", "SinglePass Multi Block Cooperative Groups");
for (int i = 0; i < testIterations; ++i)
{
gpu_result = 0;
sdkStartTimer(&timer);
call_reduceSinglePassMultiBlockCG(n, numThreads, numBlocks, d_idata, d_odata);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
}
// copy final sum from device to host
error = cudaMemcpy(&gpu_result, d_odata, sizeof(float), cudaMemcpyDeviceToHost);
checkCudaErrors(error);
return gpu_result;
}
////////////////////////////////////////////////////////////////////////////////
// The main function which runs the reduction test.
////////////////////////////////////////////////////////////////////////////////
bool
runTest(int argc, char **argv, int device)
{
int size = 1 << 25; // number of elements to reduce
bool bTestPassed = false;
if (checkCmdLineFlag(argc, (const char **) argv, "n"))
{
size = getCmdLineArgumentInt(argc, (const char **)argv, "n");
}
printf("%d elements\n", size);
// Set the device to be used
cudaDeviceProp prop = { 0 };
checkCudaErrors(cudaSetDevice(device));
checkCudaErrors(cudaGetDeviceProperties(&prop, device));
// create random input data on CPU
unsigned int bytes = size * sizeof(float);
float *h_idata = (float *) malloc(bytes);
for (int i = 0; i < size; i++)
{
// Keep the numbers small so we don't get truncation error in the sum
h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX;
}
// Determine the launch configuration (threads, blocks)
int maxThreads = 0;
int maxBlocks = 0;
if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
{
maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads");
}
else
{
maxThreads = prop.maxThreadsPerBlock;
}
if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
{
maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks");
}
else
{
maxBlocks = prop.multiProcessorCount * (prop.maxThreadsPerMultiProcessor / prop.maxThreadsPerBlock);
}
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads);
// We calculate the occupancy to know how many blocks can actually fit on the GPU
int numBlocksPerSm = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, reduceSinglePassMultiBlockCG, numThreads, numThreads*sizeof(double)));
int numSms = prop.multiProcessorCount;
if (numBlocks > numBlocksPerSm * numSms)
{
numBlocks = numBlocksPerSm * numSms;
}
printf("numThreads: %d\n", numThreads);
printf("numBlocks: %d\n", numBlocks);
// allocate mem for the result on host side
float *h_odata = (float *) malloc(numBlocks*sizeof(float));
// allocate device memory and data
float *d_idata = NULL;
float *d_odata = NULL;
checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(float)));
// copy data directly to device memory
checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(float), cudaMemcpyHostToDevice));
int testIterations = 100;
StopWatchInterface *timer = 0;
sdkCreateTimer(&timer);
float gpu_result = 0;
gpu_result = benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks,
testIterations, timer, h_odata, d_idata, d_odata);
float reduceTime = sdkGetAverageTimerValue(&timer);
printf("Average time: %f ms\n", reduceTime);
printf("Bandwidth: %f GB/s\n\n", (size * sizeof(int)) / (reduceTime * 1.0e6));
// compute reference solution
float cpu_result = reduceCPU<float>(h_idata, size);
printf("GPU result = %0.12f\n", gpu_result);
printf("CPU result = %0.12f\n", cpu_result);
double threshold = 1e-8 * size;
double diff = abs((double)gpu_result - (double)cpu_result);
bTestPassed = (diff < threshold);
// cleanup
sdkDeleteTimer(&timer);
free(h_idata);
free(h_odata);
cudaFree(d_idata);
cudaFree(d_odata);
return bTestPassed;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
Parallel reduction kernels
*/
#ifndef _REDUCE_KERNEL_H_
#define _REDUCE_KERNEL_H_
#include <cuda_runtime_api.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
/*
Parallel sum reduction using shared memory
- takes log(n) steps for n input elements
- uses n/2 threads
- only works for power-of-2 arrays
This version adds multiple elements per thread sequentially. This reduces the overall
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
(Brent's Theorem optimization)
See the CUDA SDK "reduction" sample for more information.
*/
template <unsigned int blockSize>
__device__ void
reduceBlock(volatile float *sdata, float mySum, const unsigned int tid, cg::thread_block cta)
{
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
sdata[tid] = mySum;
cg::sync(tile32);
const int VEC = 32;
const int vid = tid & (VEC-1);
float beta = mySum;
float temp;
for (int i = VEC/2; i > 0; i>>=1)
{
if (vid < i)
{
temp = sdata[tid+i];
beta += temp;
sdata[tid] = beta;
}
cg::sync(tile32);
}
cg::sync(cta);
if (cta.thread_rank() == 0)
{
beta = 0;
for (int i = 0; i < blockDim.x; i += VEC)
{
beta += sdata[i];
}
sdata[0] = beta;
}
cg::sync(cta);
}
template <unsigned int blockSize, bool nIsPow2>
__device__ void
reduceBlocks(const float *g_idata, float *g_odata, unsigned int n, cg::thread_block cta)
{
extern __shared__ float sdata[];
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
float mySum = 0;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
mySum += g_idata[i];
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
mySum += g_idata[i+blockSize];
i += gridSize;
}
// do reduction in shared mem
reduceBlock<blockSize>(sdata, mySum, tid, cta);
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <unsigned int blockSize, bool nIsPow2>
__global__ void
reduceMultiPass(const float *g_idata, float *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
reduceBlocks<blockSize, nIsPow2>(g_idata, g_odata, n, cta);
}
// Global variable used by reduceSinglePass to count how many blocks have finished
__device__ unsigned int retirementCount = 0;
cudaError_t setRetirementCount(int retCnt)
{
return cudaMemcpyToSymbol(retirementCount, &retCnt, sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
}
// This reduction kernel reduces an arbitrary size array in a single kernel invocation
// It does so by keeping track of how many blocks have finished. After each thread
// block completes the reduction of its own block of data, it "takes a ticket" by
// atomically incrementing a global counter. If the ticket value is equal to the number
// of thread blocks, then the block holding the ticket knows that it is the last block
// to finish. This last block is responsible for summing the results of all the other
// blocks.
//
// In order for this to work, we must be sure that before a block takes a ticket, all
// of its memory transactions have completed. This is what __threadfence() does -- it
// blocks until the results of all outstanding memory transactions within the
// calling thread are visible to all other threads.
//
// For more details on the reduction algorithm (notably the multi-pass approach), see
// the "reduction" sample in the CUDA SDK.
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduceSinglePass(const float *g_idata, float *g_odata, unsigned int n)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
//
// PHASE 1: Process all inputs assigned to this block
//
reduceBlocks<blockSize, nIsPow2>(g_idata, g_odata, n, cta);
//
// PHASE 2: Last block finished will process all partial sums
//
if (gridDim.x > 1)
{
const unsigned int tid = threadIdx.x;
__shared__ bool amLast;
extern float __shared__ smem[];
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
// Thread 0 takes a ticket
if (tid==0)
{
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// If the ticket ID is equal to the number of blocks, we are the last block!
amLast = (ticket == gridDim.x-1);
}
cg::sync(cta);
// The last block sums the results of all other blocks
if (amLast)
{
int i = tid;
float mySum = 0;
while (i < gridDim.x)
{
mySum += g_odata[i];
i += blockSize;
}
reduceBlock<blockSize>(smem, mySum, tid, cta);
if (tid==0)
{
g_odata[0] = smem[0];
// reset retirement count so that next run succeeds
retirementCount = 0;
}
}
}
}
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
extern "C"
void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata)
{
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);
// choose which of the optimized versions of reduction to launch;
// the block size is a template parameter, so each supported power-of-two size needs its own instantiation
if (isPow2(size))
{
switch (threads)
{
case 512:
reduceMultiPass<512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduceMultiPass<256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduceMultiPass<128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduceMultiPass< 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduceMultiPass< 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduceMultiPass< 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduceMultiPass< 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduceMultiPass< 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduceMultiPass< 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduceMultiPass< 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
else
{
switch (threads)
{
case 512:
reduceMultiPass<512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduceMultiPass<256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduceMultiPass<128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduceMultiPass< 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduceMultiPass< 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduceMultiPass< 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduceMultiPass< 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduceMultiPass< 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduceMultiPass< 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduceMultiPass< 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
}
extern "C"
void reduceSinglePass(int size, int threads, int blocks, float *d_idata, float *d_odata)
{
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = threads * sizeof(float);
// choose which of the optimized versions of reduction to launch;
// the block size is a template parameter, so each supported power-of-two size needs its own instantiation
if (isPow2(size))
{
switch (threads)
{
case 512:
reduceSinglePass<512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduceSinglePass<256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduceSinglePass<128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduceSinglePass< 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduceSinglePass< 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduceSinglePass< 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduceSinglePass< 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduceSinglePass< 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduceSinglePass< 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduceSinglePass< 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
else
{
switch (threads)
{
case 512:
reduceSinglePass<512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduceSinglePass<256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduceSinglePass<128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduceSinglePass< 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduceSinglePass< 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduceSinglePass< 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduceSinglePass< 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduceSinglePass< 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduceSinglePass< 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduceSinglePass< 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
}
#endif // #ifndef _REDUCE_KERNEL_H_
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
Parallel reduction
This sample shows how to perform a reduction operation on an array of values
to produce a single value in a single kernel (as opposed to two or more
kernel calls as shown in the "reduction" CUDA Sample). Single-pass
reduction requires global atomic instructions (Compute Capability 1.1 or
later) and the __threadfence() intrinsic (CUDA 2.2 or later).
Reductions are a very common computation in parallel algorithms. Any time
an array of values needs to be reduced to a single value using a binary
associative operator, a reduction can be used. Example applications include
statistics computations such as mean and standard deviation, and image
processing applications such as finding the total luminance of an
image.
This code performs sum reductions, but any associative operator such as
min() or max() could also be used.
It assumes the input size is a power of 2.
COMMAND LINE ARGUMENTS
"--shmoo": Test performance for 1 to 32M elements with each of the 7 different kernels
"--n=<N>": Specify the number of elements to reduce (default 1048576)
"--threads=<N>": Specify the number of threads per block (default 128)
"--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
"--cpufinal": Read back the per-block results and do final sum of block sums on CPU (default false)
"--cputhresh=<N>": The threshold of number of blocks sums below which to perform a CPU final reduction (default 1)
"--multipass": Use a multipass reduction instead of a single-pass reduction
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <helper_functions.h>
#include <helper_cuda.h>
#define VERSION_MAJOR (CUDART_VERSION/1000)
#define VERSION_MINOR (CUDART_VERSION%100)/10
const char *sSDKsample = "threadFenceReduction";
#if CUDART_VERSION >= 2020
#include "threadFenceReduction_kernel.cuh"
#else
#pragma comment(user, "CUDA 2.2 is required to build for threadFenceReduction")
#endif
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
bool runTest(int argc, char **argv);
extern "C"
{
void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata);
void reduceSinglePass(int size, int threads, int blocks, float *d_idata, float *d_odata);
}
#if CUDART_VERSION < 2020
void reduce(int size, int threads, int blocks, float *d_idata, float *d_odata)
{
printf("reduce(), compiler not supported, aborting tests\n");
}
void reduceSinglePass(int size, int threads, int blocks, float *d_idata, float *d_odata)
{
printf("reduceSinglePass(), compiler not supported, aborting tests\n");
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
cudaDeviceProp deviceProp;
deviceProp.major = 0;
deviceProp.minor = 0;
int dev;
printf("%s Starting...\n\n", sSDKsample);
dev = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
printf("GPU Device supports SM %d.%d compute capability\n\n", deviceProp.major, deviceProp.minor);
bool bTestResult = false;
#if CUDART_VERSION >= 2020
bTestResult = runTest(argc, argv);
#else
print_NVCC_min_spec(sSDKsample, "2.2", "Version 185");
exit(EXIT_SUCCESS);
#endif
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data pointer to input data
//! @param size number of input data elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
T sum = data[0];
T c = (T)0.0;
for (int i = 1; i < size; i++)
{
T y = data[i] - c;
T t = sum + y;
c = (t - sum) - y;
sum = t;
}
return sum;
}
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the reduction
// We set threads / block to the minimum of maxThreads and n/2.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
if (n == 1)
{
threads = 1;
blocks = 1;
}
else
{
threads = (n < maxThreads*2) ? nextPow2(n / 2) : maxThreads;
blocks = max(1, n / (threads * 2));
}
blocks = min(maxBlocks, blocks);
}
////////////////////////////////////////////////////////////////////////////////
// This function performs a reduction of the input data multiple times and
// measures the average reduction time.
////////////////////////////////////////////////////////////////////////////////
float benchmarkReduce(int n,
int numThreads,
int numBlocks,
int maxThreads,
int maxBlocks,
int testIterations,
bool multiPass,
bool cpuFinalReduction,
int cpuFinalThreshold,
StopWatchInterface *timer,
float *h_odata,
float *d_idata,
float *d_odata)
{
float gpu_result = 0;
bool bNeedReadback = true;
cudaError_t error;
for (int i = 0; i < testIterations; ++i)
{
gpu_result = 0;
unsigned int retCnt = 0;
error = setRetirementCount(retCnt);
checkCudaErrors(error);
cudaDeviceSynchronize();
sdkStartTimer(&timer);
if (multiPass)
{
// execute the kernel
reduce(n, numThreads, numBlocks, d_idata, d_odata);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
if (cpuFinalReduction)
{
// sum partial sums from each block on CPU
// copy result from device to host
error = cudaMemcpy(h_odata, d_odata, numBlocks*sizeof(float), cudaMemcpyDeviceToHost);
checkCudaErrors(error);
for (int i=0; i<numBlocks; i++)
{
gpu_result += h_odata[i];
}
bNeedReadback = false;
}
else
{
// sum partial block sums on GPU
int s=numBlocks;
while (s > cpuFinalThreshold)
{
int threads = 0, blocks = 0;
getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
reduce(s, threads, blocks, d_odata, d_odata);
s = s / (threads*2);
}
if (s > 1)
{
// copy result from device to host
error = cudaMemcpy(h_odata, d_odata, s * sizeof(float), cudaMemcpyDeviceToHost);
checkCudaErrors(error);
for (int i=0; i < s; i++)
{
gpu_result += h_odata[i];
}
bNeedReadback = false;
}
}
}
else
{
getLastCudaError("Kernel execution failed");
// execute the kernel
reduceSinglePass(n, numThreads, numBlocks, d_idata, d_odata);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
}
cudaDeviceSynchronize();
sdkStopTimer(&timer);
}
if (bNeedReadback)
{
// copy final sum from device to host
error = cudaMemcpy(&gpu_result, d_odata, sizeof(float), cudaMemcpyDeviceToHost);
checkCudaErrors(error);
}
return gpu_result;
}
////////////////////////////////////////////////////////////////////////////////
// This function calls benchmarkReduce multiple times for a range of array sizes
// and prints a report in CSV (comma-separated value) format that can be used for
// generating a "shmoo" plot showing the performance for each kernel variation
// over a wide range of input sizes.
////////////////////////////////////////////////////////////////////////////////
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks)
{
// create random input data on CPU
unsigned int bytes = maxN * sizeof(float);
float *h_idata = (float *) malloc(bytes);
for (int i = 0; i < maxN; i++)
{
// Keep the numbers small so we don't get truncation error in the sum
h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX;
}
int maxNumBlocks = min(65535, maxN / maxThreads);
// allocate mem for the result on host side
float *h_odata = (float *) malloc(maxNumBlocks*sizeof(float));
// allocate device memory and data
float *d_idata = NULL;
float *d_odata = NULL;
checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
checkCudaErrors(cudaMalloc((void **) &d_odata, maxNumBlocks*sizeof(float)));
// copy data directly to device memory
checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(float), cudaMemcpyHostToDevice));
// warm-up
reduce(maxN, maxThreads, maxNumBlocks, d_idata, d_odata);
int testIterations = 100;
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
// print headers
printf("N, %d blocks one pass, %d blocks multipass\n", maxBlocks, maxBlocks);
for (int i = minN; i <= maxN; i *= 2)
{
printf("%d, ", i);
for (int multiPass = 0; multiPass <= 1; multiPass++)
{
sdkResetTimer(&timer);
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(i, maxBlocks, maxThreads, numBlocks, numThreads);
benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks,
testIterations, multiPass==1, false, 1, timer, h_odata, d_idata, d_odata);
float reduceTime = sdkGetAverageTimerValue(&timer);
printf("%f%s", reduceTime, multiPass==0 ? ", " : "\n");
}
}
printf("\n");
// cleanup
sdkDeleteTimer(&timer);
free(h_idata);
free(h_odata);
cudaFree(d_idata);
cudaFree(d_odata);
}
////////////////////////////////////////////////////////////////////////////////
// The main function which runs the reduction test.
////////////////////////////////////////////////////////////////////////////////
bool
runTest(int argc, char **argv)
{
int size = 1<<20; // number of elements to reduce
int maxThreads = 128; // number of threads per block
int maxBlocks = 64;
bool cpuFinalReduction = false;
int cpuFinalThreshold = 1;
bool multipass = false;
bool bTestResult = false;
if (checkCmdLineFlag(argc, (const char **) argv, "n"))
{
size = getCmdLineArgumentInt(argc, (const char **)argv, "n");
}
if (checkCmdLineFlag(argc, (const char **) argv, "threads"))
{
maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads");
}
if (checkCmdLineFlag(argc, (const char **) argv, "maxblocks"))
{
maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks");
}
printf("%d elements\n", size);
printf("%d threads (max)\n", maxThreads);
cpuFinalReduction = checkCmdLineFlag(argc, (const char **) argv, "cpufinal");
multipass = checkCmdLineFlag(argc, (const char **) argv, "multipass");
if (checkCmdLineFlag(argc, (const char **) argv, "cputhresh"))
{
cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **) argv, "cputhresh");
}
bool runShmoo = checkCmdLineFlag(argc, (const char **) argv, "shmoo");
if (runShmoo)
{
shmoo(1, 33554432, maxThreads, maxBlocks);
}
else
{
// create random input data on CPU
unsigned int bytes = size * sizeof(float);
float *h_idata = (float *) malloc(bytes);
for (int i=0; i<size; i++)
{
// Keep the numbers small so we don't get truncation error in the sum
h_idata[i] = (rand() & 0xFF) / (float)RAND_MAX;
}
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(size, maxBlocks, maxThreads, numBlocks, numThreads);
if (numBlocks == 1)
{
cpuFinalThreshold = 1;
}
// allocate mem for the result on host side
float *h_odata = (float *) malloc(numBlocks*sizeof(float));
printf("%d blocks\n", numBlocks);
// allocate device memory and data
float *d_idata = NULL;
float *d_odata = NULL;
checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(float)));
// copy data directly to device memory
checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(float), cudaMemcpyHostToDevice));
// warm-up
reduce(size, numThreads, numBlocks, d_idata, d_odata);
int testIterations = 100;
StopWatchInterface *timer = 0;
sdkCreateTimer(&timer);
float gpu_result = 0;
gpu_result = benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks,
testIterations, multipass, cpuFinalReduction,
cpuFinalThreshold, timer, h_odata, d_idata, d_odata);
float reduceTime = sdkGetAverageTimerValue(&timer);
printf("Average time: %f ms\n", reduceTime);
printf("Bandwidth: %f GB/s\n\n", (size * sizeof(int)) / (reduceTime * 1.0e6));
// compute reference solution
float cpu_result = reduceCPU<float>(h_idata, size);
printf("GPU result = %0.12f\n", gpu_result);
printf("CPU result = %0.12f\n", cpu_result);
double threshold = 1e-8 * size;
double diff = abs((double)gpu_result - (double)cpu_result);
bTestResult = (diff < threshold);
// cleanup
sdkDeleteTimer(&timer);
free(h_idata);
free(h_odata);
cudaFree(d_idata);
cudaFree(d_odata);
}
return bTestResult;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "FunctionPointers_kernels.h"
// Texture reference for reading image
texture<unsigned char, 2> tex;
extern __shared__ unsigned char LocalBlock[];
static cudaArray *array = NULL;
#define RADIUS 1
// pixel value used for thresholding function, works well with sample image 'lena'
#define THRESHOLD 150.0f
#ifdef FIXED_BLOCKWIDTH
#define BlockWidth 80
#define SharedPitch 384
#endif
// A function pointer can be declared explicitly like this line:
//__device__ unsigned char (*pointFunction)(unsigned char, float ) = NULL;
// or by using typedefs, as below:
typedef unsigned char(*blockFunction_t)(
unsigned char, unsigned char, unsigned char,
unsigned char, unsigned char, unsigned char,
unsigned char, unsigned char, unsigned char,
float);
typedef unsigned char(*pointFunction_t)(
unsigned char, float);
__device__ blockFunction_t blockFunction;
__device__ unsigned char
ComputeSobel(unsigned char ul, // upper left
unsigned char um, // upper middle
unsigned char ur, // upper right
unsigned char ml, // middle left
unsigned char mm, // middle (unused)
unsigned char mr, // middle right
unsigned char ll, // lower left
unsigned char lm, // lower middle
unsigned char lr, // lower right
float fScale)
{
short Horz = ur + 2*mr + lr - ul - 2*ml - ll;
short Vert = ul + 2*um + ur - ll - 2*lm - lr;
short Sum = (short)(fScale*(abs((int)Horz)+abs((int)Vert)));
return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)) ;
}
// define a function pointer and initialize to NULL
__device__ unsigned char(*varFunction)(
unsigned char, unsigned char, unsigned char,
unsigned char, unsigned char, unsigned char,
unsigned char, unsigned char, unsigned char,
float x
) = NULL;
__device__ unsigned char
ComputeBox(unsigned char ul, // upper left
unsigned char um, // upper middle
unsigned char ur, // upper right
unsigned char ml, // middle left
unsigned char mm, // middle...middle
unsigned char mr, // middle right
unsigned char ll, // lower left
unsigned char lm, // lower middle
unsigned char lr, // lower right
float fscale
)
{
short Sum = (short)(ul+um+ur + ml+mm+mr + ll+lm+lr)/9;
Sum *= fscale;
return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum)) ;
}
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
if (in > thresh)
{
return 0xFF;
}
else
{
return 0;
}
}
// Declare function tables, one for the point function chosen, one for the
// block function chosen. The number of entries is determined by the
// enum in FunctionPointers_kernels.h
__device__ blockFunction_t blockFunction_table[LAST_BLOCK_FILTER];
__device__ pointFunction_t pointFunction_table[LAST_POINT_FILTER];
// Declare device side function pointers. We retrieve them later with
// cudaMemcpyFromSymbol to set our function tables above in some
// particular order specified at runtime.
__device__ blockFunction_t pComputeSobel = ComputeSobel;
__device__ blockFunction_t pComputeBox = ComputeBox;
__device__ pointFunction_t pComputeThreshold = Threshold;
// Allocate host-side tables to mirror the device-side ones; later we
// fill these tables with the function pointers. This lets us send
// the pointers to the kernel on invocation, as a method of choosing
// which function to run.
blockFunction_t h_blockFunction_table[2];
pointFunction_t h_pointFunction_table[2];
// Perform a filter operation on the data, using shared memory.
// The actual operation performed is
// determined by the function pointer "blockFunction" and selected
// by the integer argument "blockOperation" and has access
// to an apron around the current pixel being processed.
// Following the block operation, a per-pixel operation,
// pointed to by pPointFunction is performed before the final
// pixel is produced.
__global__ void
SobelShared(uchar4 *pSobelOriginal, unsigned short SobelPitch,
#ifndef FIXED_BLOCKWIDTH
short BlockWidth, short SharedPitch,
#endif
short w, short h, float fScale,
int blockOperation, pointFunction_t pPointFunction
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
short u = 4*blockIdx.x*BlockWidth;
short v = blockIdx.y*blockDim.y + threadIdx.y;
short ib;
int SharedIdx = threadIdx.y * SharedPitch;
for (ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x)
{
LocalBlock[SharedIdx+4*ib+0] = tex2D(tex,
(float)(u+4*ib-RADIUS+0), (float)(v-RADIUS));
LocalBlock[SharedIdx+4*ib+1] = tex2D(tex,
(float)(u+4*ib-RADIUS+1), (float)(v-RADIUS));
LocalBlock[SharedIdx+4*ib+2] = tex2D(tex,
(float)(u+4*ib-RADIUS+2), (float)(v-RADIUS));
LocalBlock[SharedIdx+4*ib+3] = tex2D(tex,
(float)(u+4*ib-RADIUS+3), (float)(v-RADIUS));
}
if (threadIdx.y < RADIUS*2)
{
//
// copy trailing RADIUS*2 rows of pixels into shared
//
SharedIdx = (blockDim.y+threadIdx.y) * SharedPitch;
for (ib = threadIdx.x; ib < BlockWidth+2*RADIUS; ib += blockDim.x)
{
LocalBlock[SharedIdx+4*ib+0] = tex2D(tex,
(float)(u+4*ib-RADIUS+0), (float)(v+blockDim.y-RADIUS));
LocalBlock[SharedIdx+4*ib+1] = tex2D(tex,
(float)(u+4*ib-RADIUS+1), (float)(v+blockDim.y-RADIUS));
LocalBlock[SharedIdx+4*ib+2] = tex2D(tex,
(float)(u+4*ib-RADIUS+2), (float)(v+blockDim.y-RADIUS));
LocalBlock[SharedIdx+4*ib+3] = tex2D(tex,
(float)(u+4*ib-RADIUS+3), (float)(v+blockDim.y-RADIUS));
}
}
cg::sync(cta);
u >>= 2; // index as uchar4 from here
uchar4 *pSobel = (uchar4 *)(((char *) pSobelOriginal)+v*SobelPitch);
SharedIdx = threadIdx.y * SharedPitch;
blockFunction = blockFunction_table[blockOperation];
for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x)
{
uchar4 out;
unsigned char pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+0];
unsigned char pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+1];
unsigned char pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+2];
unsigned char pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+0];
unsigned char pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+1];
unsigned char pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+2];
unsigned char pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+0];
unsigned char pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+1];
unsigned char pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+2];
out.x = (*blockFunction)(pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale);
pix00 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+3];
pix10 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+3];
pix20 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+3];
out.y = (*blockFunction)(pix01, pix02, pix00,
pix11, pix12, pix10,
pix21, pix22, pix20, fScale);
pix01 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+4];
pix11 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+4];
pix21 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+4];
out.z = (*blockFunction)(pix02, pix00, pix01,
pix12, pix10, pix11,
pix22, pix20, pix21, fScale);
pix02 = LocalBlock[SharedIdx+4*ib+0*SharedPitch+5];
pix12 = LocalBlock[SharedIdx+4*ib+1*SharedPitch+5];
pix22 = LocalBlock[SharedIdx+4*ib+2*SharedPitch+5];
out.w = (*blockFunction)(pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale);
if (pPointFunction != NULL)
{
out.x = (*pPointFunction)(out.x, THRESHOLD);
out.y = (*pPointFunction)(out.y, THRESHOLD);
out.z = (*pPointFunction)(out.z, THRESHOLD);
out.w = (*pPointFunction)(out.w, THRESHOLD);
}
if (u+ib < w/4 && v < h)
{
pSobel[u+ib] = out;
}
}
cg::sync(cta);
}
__global__ void
SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch,
int w, int h, float fscale)
{
unsigned char *pSobel =
(unsigned char *)(((char *) pSobelOriginal)+blockIdx.x*Pitch);
for (int i = threadIdx.x; i < w; i += blockDim.x)
{
pSobel[i] = min(max((tex2D(tex, (float) i, (float) blockIdx.x) * fscale), 0.f), 255.f);
}
}
// Perform block and point filtering using texture lookups.
// The block and point operations are determined by the
// input argument (see comment above for "SobelShared" function)
__global__ void
SobelTex(Pixel *pSobelOriginal, unsigned int Pitch,
int w, int h, float fScale, int blockOperation, pointFunction_t pPointOperation)
{
unsigned char *pSobel =
(unsigned char *)(((char *) pSobelOriginal)+blockIdx.x*Pitch);
unsigned char tmp = 0;
for (int i = threadIdx.x; i < w; i += blockDim.x)
{
unsigned char pix00 = tex2D(tex, (float) i-1, (float) blockIdx.x-1);
unsigned char pix01 = tex2D(tex, (float) i+0, (float) blockIdx.x-1);
unsigned char pix02 = tex2D(tex, (float) i+1, (float) blockIdx.x-1);
unsigned char pix10 = tex2D(tex, (float) i-1, (float) blockIdx.x+0);
unsigned char pix11 = tex2D(tex, (float) i+0, (float) blockIdx.x+0);
unsigned char pix12 = tex2D(tex, (float) i+1, (float) blockIdx.x+0);
unsigned char pix20 = tex2D(tex, (float) i-1, (float) blockIdx.x+1);
unsigned char pix21 = tex2D(tex, (float) i+0, (float) blockIdx.x+1);
unsigned char pix22 = tex2D(tex, (float) i+1, (float) blockIdx.x+1);
tmp = (*(blockFunction_table[blockOperation]))(pix00, pix01, pix02,
pix10, pix11, pix12,
pix20, pix21, pix22, fScale);
if (pPointOperation != NULL)
{
tmp = (*pPointOperation)(tmp, THRESHOLD);
}
pSobel[i] = tmp;
}
}
extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp)
{
cudaChannelFormatDesc desc;
if (Bpp == 1)
{
desc = cudaCreateChannelDesc<unsigned char>();
}
else
{
desc = cudaCreateChannelDesc<uchar4>();
}
checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih));
checkCudaErrors(cudaMemcpyToArray(array, 0, 0, data, Bpp*sizeof(Pixel)*iw*ih, cudaMemcpyHostToDevice));
}
extern "C" void deleteTexture(void)
{
checkCudaErrors(cudaFreeArray(array));
}
// Copy the pointers from the function tables to the host side
void setupFunctionTables()
{
// Dynamically assign the function table.
// Copy the function pointers to their appropriate locations according to the enum
checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[SOBEL_FILTER], pComputeSobel, sizeof(blockFunction_t)));
checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[BOX_FILTER], pComputeBox, sizeof(blockFunction_t)));
// do the same for the point function, where the 2nd function is NULL ("no-op" filter, skipped in kernel code)
checkCudaErrors(cudaMemcpyFromSymbol(&h_pointFunction_table[THRESHOLD_FILTER], pComputeThreshold, sizeof(pointFunction_t)));
h_pointFunction_table[NULL_FILTER] = NULL;
// now copy the function tables back to the device, so if we wish we can use an index into the table to choose them
// We have now set the order in the function table according to our enum.
checkCudaErrors(cudaMemcpyToSymbol(blockFunction_table, h_blockFunction_table, sizeof(blockFunction_t)*LAST_BLOCK_FILTER));
checkCudaErrors(cudaMemcpyToSymbol(pointFunction_table, h_pointFunction_table, sizeof(pointFunction_t)*LAST_POINT_FILTER));
}
// Wrapper for the __global__ call that sets up the texture and threads
// Below, two methods for selecting the image-processing function to run are shown.
// BlockOperation is an integer kernel argument used as an index into the blockFunction_table on the device side
// pPointOp is itself a function pointer passed as a kernel argument, retrieved from a host side copy of the function table
extern "C" void sobelFilter(Pixel *odata, int iw, int ih, enum SobelDisplayMode mode, float fScale, int blockOperation, int pointOperation)
{
checkCudaErrors(cudaBindTextureToArray(tex, array));
pointFunction_t pPointOp = h_pointFunction_table[pointOperation];
switch (mode)
{
case SOBELDISPLAY_IMAGE:
SobelCopyImage<<<ih, 384>>>(odata, iw, iw, ih, fScale);
break;
case SOBELDISPLAY_SOBELTEX:
SobelTex<<<ih, 384>>>(odata, iw, iw, ih, fScale, blockOperation, pPointOp);
break;
case SOBELDISPLAY_SOBELSHARED:
{
dim3 threads(16,4);
#ifndef FIXED_BLOCKWIDTH
int BlockWidth = 80; // must be divisible by 16 for coalescing
#endif
dim3 blocks = dim3(iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)),
ih/threads.y+(0!=ih%threads.y));
int SharedPitch = ~0x3f&(4*(BlockWidth+2*RADIUS)+0x3f);
int sharedMem = SharedPitch*(threads.y+2*RADIUS);
// for the shared kernel, width must be divisible by 4
iw &= ~3;
SobelShared<<<blocks, threads, sharedMem>>>((uchar4 *) odata,
iw,
#ifndef FIXED_BLOCKWIDTH
BlockWidth, SharedPitch,
#endif
iw, ih, fScale, blockOperation, pPointOp);
}
break;
}
checkCudaErrors(cudaUnbindTexture(tex));
}
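// Illustrative host-side call order for this file (a minimal sketch, not the
// sample's actual main application; d_output/h_image are placeholder names and
// buffer allocation, display and error checks are omitted):
//
//   setupTexture(imgWidth, imgHeight, h_image, 1);   // 1 byte-per-pixel greyscale input
//   setupFunctionTables();                           // order the device function tables
//   sobelFilter(d_output, imgWidth, imgHeight,
//               SOBELDISPLAY_SOBELTEX, 1.0f,
//               SOBEL_FILTER, THRESHOLD_FILTER);     // Sobel block op + threshold point op
//   deleteTexture();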
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//
// This sample demonstrates dynamic global memory allocation through device C++ new and delete operators and virtual function declarations available with CUDA 4.0.
#include <stdio.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <stdlib.h>
#include <vector>
#include <algorithm>
const char *sSDKsample = "newdelete";
#include "container.hpp"
/////////////////////////////////////////////////////////////////////////////
//
// Kernels to allocate and instantiate Container objects on the device heap
//
////////////////////////////////////////////////////////////////////////////
__global__
void vectorCreate(Container<int> **g_container, int max_size)
{
// The Vector object and the data storage are allocated in device heap memory.
// This makes it persistent for the lifetime of the CUDA context.
// The grid has only one thread as only a single object instance is needed.
*g_container = new Vector<int>(max_size);
}
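// Note: device-side "new" allocates from the device malloc heap. Its size is
// configured with cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...) before kernels
// that allocate are launched; main() below reserves 128 MB for this purpose.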
/////////////////////////////////////////////////////////////////////////////
//
// Kernels to fill and consume shared Container objects.
//
////////////////////////////////////////////////////////////////////////////
__global__
void containerFill(Container<int> **g_container)
{
// The first thread of each block pushes its block index, so the grid as a whole populates the shared Container object with the values 0..gridDim.x-1.
if (threadIdx.x == 0)
{
(*g_container)->push(blockIdx.x);
}
}
__global__
void containerConsume(Container<int> **g_container, int *d_result)
{
// All threads of the grid cooperatively consume the data from the shared Container object.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int v;
if ((*g_container)->pop(v))
{
d_result[idx] = v;
}
else
{
d_result[idx] = -1;
}
}
/////////////////////////////////////////////////////////////////////////////
//
// Kernel to delete shared Container objects.
//
////////////////////////////////////////////////////////////////////////////
__global__
void containerDelete(Container<int> **g_container)
{
delete *g_container;
}
///////////////////////////////////////////////////////////////////////////////////////////
//
// Kernels using placement new to put shared Vector objects and data in shared memory
//
///////////////////////////////////////////////////////////////////////////////////////////
__global__
void placementNew(int *d_result)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<int>)];
__shared__ int __align__(8) s_data[1024];
__shared__ Vector<int> *s_vector;
// The first thread of the block initializes the shared Vector object.
// The placement new operator enables the Vector object and the data array to be placed in shared memory.
if (threadIdx.x == 0)
{
s_vector = new(s_buffer) Vector<int>(1024, s_data);
}
cg::sync(cta);
if ((threadIdx.x & 1) == 0)
{
s_vector->push(threadIdx.x >> 1);
}
// Need to sync as the vector implementation does not support concurrent push/pop operations.
cg::sync(cta);
int v;
if (s_vector->pop(v))
{
d_result[threadIdx.x] = v;
}
else
{
d_result[threadIdx.x] = -1;
}
// Note: deleting objects placed in shared memory is not necessary (lifetime of shared memory is that of the block)
}
struct ComplexType_t
{
int a;
int b;
float c;
float d;
};
__global__
void complexVector(int *d_result)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<ComplexType_t>)];
__shared__ ComplexType_t __align__(8) s_data[1024];
__shared__ Vector<ComplexType_t> *s_vector;
// The first thread of the block initializes the shared Vector object.
// The placement new operator enables the Vector object and the data array to be placed in shared memory.
if (threadIdx.x == 0)
{
s_vector = new(s_buffer) Vector<ComplexType_t>(1024, s_data);
}
cg::sync(cta);
if ((threadIdx.x & 1) == 0)
{
ComplexType_t data;
data.a = threadIdx.x >> 1;
data.b = blockIdx.x;
data.c = threadIdx.x / (float)(blockDim.x);
data.d = blockIdx.x / (float)(gridDim.x);
s_vector->push(data);
}
cg::sync(cta);
ComplexType_t v;
if (s_vector->pop(v))
{
d_result[threadIdx.x] = v.a;
}
else
{
d_result[threadIdx.x] = -1;
}
// Note: deleting objects placed in shared memory is not necessary (lifetime of shared memory is that of the block)
}
///////////////////////////////////////////////////////////////////////////////////////////
//
// Host code
//
///////////////////////////////////////////////////////////////////////////////////////////
bool checkResult(int *d_result, int N)
{
std::vector<int> h_result;
h_result.resize(N);
checkCudaErrors(cudaMemcpy(&h_result[0], d_result, N*sizeof(int), cudaMemcpyDeviceToHost));
std::sort(h_result.begin(), h_result.end());
bool success = true;
bool test = false;
int value=0;
for (int i=0; i < N; ++i)
{
if (h_result[i] != -1)
{
test = true;
}
if (test && (value++) != h_result[i])
{
success = false;
}
}
return success;
}
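// Example of what checkResult() accepts: with N = 4 and only two successful pops,
// a sorted result of { -1, -1, 0, 1 } passes, whereas { -1, -1, 0, 2 } fails,
// because the popped values must form a contiguous run starting at 0.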
bool testContainer(Container<int> **d_container, int blocks, int threads)
{
int *d_result;
cudaMalloc(&d_result, blocks*threads*sizeof(int));
containerFill<<<blocks,threads>>>(d_container);
containerConsume<<<blocks,threads>>>(d_container, d_result);
containerDelete<<<1,1>>>(d_container);
checkCudaErrors(cudaDeviceSynchronize());
bool success = checkResult(d_result, blocks*threads);
cudaFree(d_result);
return success;
}
bool testPlacementNew(int threads)
{
int *d_result;
cudaMalloc(&d_result, threads*sizeof(int));
placementNew<<<1, threads>>>(d_result);
checkCudaErrors(cudaDeviceSynchronize());
bool success = checkResult(d_result, threads);
cudaFree(d_result);
return success;
}
bool testComplexType(int threads)
{
int *d_result;
cudaMalloc(&d_result, threads*sizeof(int));
complexVector<<<1, threads>>>(d_result);
checkCudaErrors(cudaDeviceSynchronize());
bool success = checkResult(d_result, threads);
cudaFree(d_result);
return success;
}
///////////////////////////////////////////////////////////////////////////////////////////
//
// MAIN
//
///////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
int cuda_device = 0;
printf("%s Starting...\n\n", sSDKsample);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
// set the heap size for device-side new/delete to 128 MB
checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * (1 << 20)));
Container<int> **d_container;
checkCudaErrors(cudaMalloc(&d_container, sizeof(Container<int> **)));
bool bTest = false;
int test_passed = 0;
printf(" > Container = Vector test ");
vectorCreate<<<1,1>>>(d_container, 128 * 128);
bTest = testContainer(d_container, 128, 128);
printf(bTest ? "OK\n\n" : "NOT OK\n\n");
test_passed += (bTest ? 1 : 0);
checkCudaErrors(cudaFree(d_container));
printf(" > Container = Vector, using placement new on SMEM buffer test ");
bTest = testPlacementNew(1024);
printf(bTest ? "OK\n\n" : "NOT OK\n\n");
test_passed += (bTest ? 1 : 0);
printf(" > Container = Vector, with user defined datatype test ");
bTest = testComplexType(1024);
printf(bTest ? "OK\n\n" : "NOT OK\n\n");
test_passed += (bTest ? 1 : 0);
printf("Test Summary: %d/3 succesfully run\n", test_passed);
exit(test_passed==3 ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This is a simple test showing huge access speed gap
* between aligned and misaligned structures
* (those having/missing __align__ keyword).
* It measures per-element copy throughput for
* aligned and misaligned structures on
* big chunks of data.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <helper_functions.h> // helper utility functions
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
////////////////////////////////////////////////////////////////////////////////
// Misaligned types
////////////////////////////////////////////////////////////////////////////////
typedef unsigned char uint8;
typedef unsigned short int uint16;
typedef struct
{
unsigned char r, g, b, a;
} RGBA8_misaligned;
typedef struct
{
unsigned int l, a;
} LA32_misaligned;
typedef struct
{
unsigned int r, g, b;
} RGB32_misaligned;
typedef struct
{
unsigned int r, g, b, a;
} RGBA32_misaligned;
////////////////////////////////////////////////////////////////////////////////
// Aligned types
////////////////////////////////////////////////////////////////////////////////
typedef struct __align__(4)
{
unsigned char r, g, b, a;
}
RGBA8;
typedef unsigned int I32;
typedef struct __align__(8)
{
unsigned int l, a;
}
LA32;
typedef struct __align__(16)
{
unsigned int r, g, b;
}
RGB32;
typedef struct __align__(16)
{
unsigned int r, g, b, a;
}
RGBA32;
////////////////////////////////////////////////////////////////////////////////
// Because G80 class hardware natively supports global memory operations
// only on data elements of 4, 8 and 16 bytes, a structure whose size
// exceeds 16 bytes can't be read or written efficiently: more than one
// non-coalescable global memory load/store instruction will be generated,
// even if the __align__ qualifier is supplied.
// A "structure of arrays" storage strategy offers the best performance
// in the general case. See section 5.1.2 of the Programming Guide.
////////////////////////////////////////////////////////////////////////////////
typedef struct __align__(16)
{
RGBA32 c1, c2;
}
RGBA32_2;
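// For reference, a structure-of-arrays layout (an illustrative sketch, not used
// by this test) sidesteps the issue entirely: each component stream is accessed
// with plain 4-byte coalesced loads, however many components a "pixel" has.
//
//   struct RGBA32_SoA
//   {
//       unsigned int *r, *g, *b, *a; // one contiguous array per component
//   };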
////////////////////////////////////////////////////////////////////////////////
// Common host and device functions
////////////////////////////////////////////////////////////////////////////////
//Round a / b to nearest higher integer value
int iDivUp(int a, int b)
{
return (a % b != 0) ? (a / b + 1) : (a / b);
}
//Round a / b to nearest lower integer value
int iDivDown(int a, int b)
{
return a / b;
}
//Align a to nearest higher multiple of b
int iAlignUp(int a, int b)
{
return (a % b != 0) ? (a - a % b + b) : a;
}
//Align a to nearest lower multiple of b
int iAlignDown(int a, int b)
{
return a - a % b;
}
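// Worked examples of the helpers above (illustration only):
//   iDivUp(10, 4) == 3, iDivDown(10, 4) == 2
//   iAlignUp(10, 4) == 12, iAlignDown(10, 4) == 8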
////////////////////////////////////////////////////////////////////////////////
// Simple CUDA kernel.
// Copy is carried out on a per-element basis,
// so it's not per-byte in the case of padded structures.
////////////////////////////////////////////////////////////////////////////////
template<class TData> __global__ void testKernel(
TData *d_odata,
TData *d_idata,
int numElements
)
{
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
const int numThreads = blockDim.x * gridDim.x;
for (int pos = tid; pos < numElements; pos += numThreads)
{
d_odata[pos] = d_idata[pos];
}
}
////////////////////////////////////////////////////////////////////////////////
// Validation routine for the simple copy kernel.
// We must know the "packed" size of TData (number_of_fields * sizeof(simple_type))
// and compare only these "packed" parts of the structure,
// which contain the actual user data. The contents of padding bytes
// are unspecified, since padding is merely a placeholder
// and doesn't carry any user data.
////////////////////////////////////////////////////////////////////////////////
template<class TData> int testCPU(
TData *h_odata,
TData *h_idata,
int numElements,
int packedElementSize
)
{
for (int pos = 0; pos < numElements; pos++)
{
TData src = h_idata[pos];
TData dst = h_odata[pos];
for (int i = 0; i < packedElementSize; i++)
if (((char *)&src)[i] != ((char *)&dst)[i])
{
return 0;
}
}
return 1;
}
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
//Memory chunk size in bytes. Reused for test
const int MEM_SIZE = 50000000;
const int NUM_ITERATIONS = 32;
//GPU input and output data
unsigned char *d_idata, *d_odata;
//CPU input data and instance of GPU output data
unsigned char *h_idataCPU, *h_odataGPU;
StopWatchInterface *hTimer = NULL;
template<class TData> int runTest(int packedElementSize, int memory_size)
{
const int totalMemSizeAligned = iAlignDown(memory_size, sizeof(TData));
const int numElements = iDivDown(memory_size, sizeof(TData));
//Clean output buffer before current test
checkCudaErrors(cudaMemset(d_odata, 0, memory_size));
//Run test
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
for (int i = 0; i < NUM_ITERATIONS; i++)
{
testKernel<TData><<<64, 256>>>(
(TData *)d_odata,
(TData *)d_idata,
numElements
);
getLastCudaError("testKernel() execution failed\n");
}
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
double gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
printf(
"Avg. time: %f ms / Copy throughput: %f GB/s.\n", gpuTime,
(double)totalMemSizeAligned / (gpuTime * 0.001 * 1073741824.0)
);
//Read back GPU results and run validation
checkCudaErrors(cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost));
int flag = testCPU(
(TData *)h_odataGPU,
(TData *)h_idataCPU,
numElements,
packedElementSize
);
printf(flag ? "\tTEST OK\n" : "\tTEST FAILURE\n");
return !flag;
}
int main(int argc, char **argv)
{
int i, nTotalFailures = 0;
int devID;
cudaDeviceProp deviceProp;
printf("[%s] - Starting...\n", argv[0]);
// find first CUDA device
devID = findCudaDevice(argc, (const char **)argv);
// get number of SMs on this GPU
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
deviceProp.name, deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
// Anything that is less than 192 Cores will have a scaled down workload
float scale_factor = max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
int MemorySize = (int)(MEM_SIZE/scale_factor) & 0xffffff00; // force multiple of 256 bytes
printf("> Compute scaling value = %4.2f\n", scale_factor);
printf("> Memory Size = %d\n", MemorySize);
sdkCreateTimer(&hTimer);
printf("Allocating memory...\n");
h_idataCPU = (unsigned char *)malloc(MemorySize);
h_odataGPU = (unsigned char *)malloc(MemorySize);
checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize));
checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize));
printf("Generating host input data array...\n");
for (i = 0; i < MemorySize; i++)
{
h_idataCPU[i] = (i & 0xFF) + 1;
}
printf("Uploading input data to GPU memory...\n");
checkCudaErrors(cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice));
printf("Testing misaligned types...\n");
printf("uint8...\n");
nTotalFailures += runTest<uint8>(1, MemorySize);
printf("uint16...\n");
nTotalFailures += runTest<uint16>(2, MemorySize);
printf("RGBA8_misaligned...\n");
nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize);
printf("LA32_misaligned...\n");
nTotalFailures += runTest<LA32_misaligned>(8, MemorySize);
printf("RGB32_misaligned...\n");
nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize);
printf("RGBA32_misaligned...\n");
nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize);
printf("Testing aligned types...\n");
printf("RGBA8...\n");
nTotalFailures += runTest<RGBA8>(4, MemorySize);
printf("I32...\n");
nTotalFailures += runTest<I32>(4, MemorySize);
printf("LA32...\n");
nTotalFailures += runTest<LA32>(8, MemorySize);
printf("RGB32...\n");
nTotalFailures += runTest<RGB32>(12, MemorySize);
printf("RGBA32...\n");
nTotalFailures += runTest<RGBA32>(16, MemorySize);
printf("RGBA32_2...\n");
nTotalFailures += runTest<RGBA32_2>(32, MemorySize);
printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures);
printf("Shutting down...\n");
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata));
free(h_odataGPU);
free(h_idataCPU);
sdkDeleteTimer(&hTimer);
if (nTotalFailures != 0)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the cudaStreamWaitEvent function.
//
// Devices of compute capability 2.0 or higher can overlap the kernels
//
#include <stdio.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_functions.h>
#include <helper_cuda.h>
// This is a kernel that does no real work but runs at least for a specified number of clocks
__global__ void clock_block(clock_t *d_o, clock_t clock_count)
{
unsigned int start_clock = (unsigned int) clock();
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetic):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
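// For example, with unsigned 32-bit wrap-around: start_clock = 0xFFFFFFF0 and
// end_clock = 0x00000010 give end_clock - start_clock = 0x20 = 32 clocks.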
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ clock_t s_clocks[32];
clock_t my_sum = 0;
for (int i = threadIdx.x; i < N; i+= blockDim.x)
{
my_sum += d_clocks[i];
}
s_clocks[threadIdx.x] = my_sum;
cg::sync(cta);
for (int i=16; i>0; i/=2)
{
if (threadIdx.x < i)
{
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
}
cg::sync(cta);
}
d_clocks[0] = s_clocks[0];
}
int main(int argc, char **argv)
{
int nkernels = 8; // number of concurrent kernels
int nstreams = nkernels + 1; // use one more stream than concurrent kernels
int nbytes = nkernels * sizeof(clock_t); // number of data bytes
float kernel_time = 10; // time the kernel should run in ms
float elapsed_time; // timing variables
int cuda_device = 0;
printf("[%s] - Starting...\n", argv[0]);
// get number of kernels if overridden on the command line
if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
{
nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
nstreams = nkernels + 1;
}
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
if ((deviceProp.concurrentKernels == 0))
{
printf("> GPU does not support concurrent kernel execution\n");
printf(" CUDA kernel runs will be serialized\n");
}
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// allocate host memory
clock_t *a = 0; // pointer to the array data in host memory
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
// allocate device memory
clock_t *d_a = 0; // pointers to data and init value in the device memory
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
// allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// create CUDA event handles
cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
// the events are used for synchronization only and hence do not need to record timings
// this also prevents the events from introducing global sync points when recorded, which is critical to get overlap
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
for (int i = 0; i < nkernels; i++)
{
checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
}
//////////////////////////////////////////////////////////////////////
// time execution with nkernels streams
clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
// the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
#else
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
cudaEventRecord(start_event, 0);
// queue nkernels in separate streams and record when they are done
for (int i=0; i<nkernels; ++i)
{
clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
total_clocks += time_clocks;
checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
// make the last stream wait for the kernel event to be recorded
checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
}
// queue a sum kernel and a copy back to host in the last stream.
// the commands in this stream only execute once all the preceding kernel events have completed
sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
// at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
// in this sample we just wait until the GPU is done
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
bool bTestResult = (a[0] > total_clocks);
// release resources
for (int i = 0; i < nkernels; i++)
{
cudaStreamDestroy(streams[i]);
cudaEventDestroy(kernelEvent[i]);
}
free(streams);
free(kernelEvent);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
if (!bTestResult)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This application demonstrates an approach to the image segmentation
* trees construction. It is based on Boruvka's MST algorithm.
* Here's the complete list of references:
* 1) V. Vineet et al, "Fast Minimum Spanning Tree for
* Large Graphs on the GPU";
* 2) P. Felzenszwalb et al, "Efficient Graph-Based Image Segmentation";
* 3) A. Ion et al, "Considerations Regarding the Minimum Spanning
* Tree Pyramid Segmentation Method".
*/
// System includes.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// STL includes.
#include <iostream>
#include <fstream>
#include <iterator>
#include <vector>
#include <list>
#include <deque>
#include <algorithm>
// Thrust library includes.
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
#include <thrust/reduce.h>
#include <thrust/unique.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/sort.h>
#include <thrust/adjacent_difference.h>
#include <thrust/find.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
// Sample framework includes.
#include <helper_functions.h>
#include <helper_cuda.h>
// Project includes.
#include "common.cuh"
// Kernels.
#include "kernels.cuh"
using std::cin;
using std::cout;
using std::endl;
using std::vector;
using std::list;
using std::deque;
// Very simple von Neumann middle-square prng. rand() is different across
// various OS platforms, which makes testing and the output inconsistent.
int myrand(void)
{
static int seed = 72191;
char sq[22];
seed *= seed;
sprintf(sq, "%010d", seed);
// pull the middle 5 digits out of sq
sq[8] = 0;
seed = atoi(&sq[3]);
return seed;
}
// Simple memory pool class. It is nothing more than an array of fixed-sized
// arrays.
template <typename T>
class DeviceMemoryPool
{
public:
// The parameters of the constructor are as follows:
// 1) uint chunkSize --- size of the particular array;
// 2) uint chunksCount --- number of fixed-sized arrays.
DeviceMemoryPool(uint chunkSize, uint chunksCount) :
chunkSize_(chunkSize)
{
chunkRawSize_ = (chunkSize * sizeof(T) + 511) & ~511;
try
{
basePtr_ =
thrust::device_malloc(chunkRawSize_ * chunksCount);
}
catch (thrust::system_error &e)
{
cout << "Pool memory allocation failed (" << e.what() << ")"
<< endl;
exit(EXIT_FAILURE);
}
for (uint chunkIndex = 0; chunkIndex < chunksCount; ++chunkIndex)
{
chunks_.push_back(
thrust::device_ptr<T>(
reinterpret_cast<T *>(
static_cast<char *>(basePtr_.get()) +
chunkRawSize_ * chunkIndex)));
}
}
~DeviceMemoryPool()
{
try
{
thrust::device_free(basePtr_);
}
catch (thrust::system_error &e)
{
cout << "Pool memory allocation failed (" << e.what() << ")"
<< endl;
exit(EXIT_FAILURE);
}
}
// Returns an address of the first available array
// in the memory pool.
thrust::device_ptr<T> get()
{
thrust::device_ptr<T> ptr(chunks_.back());
chunks_.pop_back();
return ptr;
}
// Pushes an address stored in "ptr" to the list
// of available arrays of the memory pool.
// Note that it is the user who is responsible for returning
// the previously requested memory to the appropriate pool.
inline void put(const thrust::device_ptr<T> &ptr)
{
chunks_.push_back(ptr);
}
uint totalFreeChunks() const
{
return chunks_.size();
}
private:
uint chunkSize_, chunkRawSize_;
thrust::device_ptr<void> basePtr_;
list< thrust::device_ptr<T> > chunks_;
};
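// Illustrative usage of DeviceMemoryPool (a sketch only; the real pools are
// created inside SegmentationTreeBuilder::run() below):
//
//   DeviceMemoryPool<uint> pool(1024, 4);            // four chunks of 1024 uints each
//   thrust::device_ptr<uint> scratch = pool.get();   // borrow one chunk
//   thrust::fill(scratch, scratch + 1024, 0u);       // ... use it as a temporary buffer
//   pool.put(scratch);                               // hand it back when done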
// Graph structure.
struct Graph
{
Graph() {}
Graph(uint verticesCount, uint edgesCount) :
vertices(verticesCount),
edges(edgesCount),
weights(edgesCount)
{}
// This vector stores offsets for each vertex in "edges" and "weights"
// vectors. For example:
// "vertices[0]" is an index of the first outgoing edge of vertex #0,
// "vertices[1]" is an index of the first outgoing edge of vertex #1, etc.
vector<uint> vertices;
// This vector stores indices of endpoints of the corresponding edges.
// For example, "edges[vertices[0]]" is the first neighbouring vertex
// of vertex #0.
vector<uint> edges;
// This vector stores weights of the corresponding edges.
vector<float> weights;
};
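// Concrete example (illustration only): for a 1x3 image row with pixels p0-p1-p2,
// buildGraph() below produces
//   vertices = { 0, 1, 3 }         (p0 has 1 outgoing edge, p1 has 2, p2 has 1)
//   edges    = { 1, 0, 2, 1 }      (p0->p1, p1->p0, p1->p2, p2->p1)
//   weights  = the corresponding 4 colour distances.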
// Simple segmentation tree class.
// Each level of the tree corresponds to the segmentation.
// See "Level" class for the details.
class Pyramid
{
public:
void addLevel(uint totalSuperNodes,
uint totalNodes,
thrust::device_ptr<uint> superVerticesOffsets,
thrust::device_ptr<uint> verticesIDs)
{
levels_.push_back(Level(totalSuperNodes, totalNodes));
levels_.back().buildFromDeviceData(superVerticesOffsets,
verticesIDs);
}
uint levelsCount() const
{
return static_cast<uint>(levels_.size());
}
void dump(uint width, uint height) const
{
char filename[256], format[256];
uint levelIndex = 0;
uint requiredDigitsCount =
static_cast<uint>(log10(static_cast<float>(levelsCount()))) +
1;
sprintf(format, "level_%%0%uu.ppm", requiredDigitsCount);
for (LevelsIterator level = levels_.rbegin();
level != levels_.rend();
++level, ++levelIndex)
{
sprintf(filename, format, levelIndex);
dumpLevel(level, width, height, filename);
}
}
private:
// Level of the segmentation tree.
class Level
{
public:
Level(uint totalSuperNodes, uint totalNodes) :
superNodesOffsets_(totalSuperNodes), nodes_(totalNodes)
{
}
void buildFromDeviceData(
thrust::device_ptr<uint> superVerticesOffsets,
thrust::device_ptr<uint> verticesIDs)
{
checkCudaErrors(
cudaMemcpy(&(superNodesOffsets_[0]),
superVerticesOffsets.get(),
sizeof(uint) * superNodesOffsets_.size(),
cudaMemcpyDeviceToHost));
checkCudaErrors(
cudaMemcpy(&(nodes_[0]),
verticesIDs.get(),
sizeof(uint) * nodes_.size(),
cudaMemcpyDeviceToHost));
}
private:
friend class Pyramid;
// The pair of the following vectors describes the
// relation between the consecutive levels.
// Consider an example. Let the index of the current level be n.
// Then nodes of level #(n-1) with indices stored in
// "nodes[superNodesOffsets_[0]]",
// "nodes[superNodesOffsets_[0] + 1]",
// ...,
// "nodes[superNodesOffsets_[1] - 1]"
// correspond to vertex #0 of level #n. And so on.
vector<uint> superNodesOffsets_;
vector<uint> nodes_;
};
typedef list<Level>::const_reverse_iterator LevelsIterator;
// Dumps the level to the file "level_n.ppm", where n
// is the index of the level. Segments are drawn in random colors.
void dumpLevel(LevelsIterator level,
uint width,
uint height,
const char *filename) const
{
deque< std::pair<uint, uint> > nodesQueue;
uint totalSegments;
{
const vector<uint> &superNodesOffsets =
level->superNodesOffsets_;
const vector<uint> &nodes =
level->nodes_;
totalSegments = static_cast<uint>(superNodesOffsets.size());
for (uint superNodeIndex = 0, nodeIndex = 0;
superNodeIndex < superNodesOffsets.size();
++superNodeIndex)
{
uint superNodeEnd =
superNodeIndex + 1 < superNodesOffsets.size() ?
superNodesOffsets[superNodeIndex + 1] :
static_cast<uint>(nodes.size());
for (; nodeIndex < superNodeEnd; ++nodeIndex)
{
nodesQueue.push_back(std::make_pair(nodes[nodeIndex],
superNodeIndex));
}
}
}
++level;
while (level != levels_.rend())
{
uint superNodesCount = static_cast<uint>(nodesQueue.size());
const vector<uint> &superNodesOffsets =
level->superNodesOffsets_;
const vector<uint> &nodes =
level->nodes_;
while (superNodesCount--)
{
std::pair<uint, uint> currentNode = nodesQueue.front();
nodesQueue.pop_front();
uint superNodeBegin = superNodesOffsets[currentNode.first];
uint superNodeEnd =
currentNode.first + 1 < superNodesOffsets.size() ?
superNodesOffsets[currentNode.first + 1] :
static_cast<uint>(nodes.size());
for (uint nodeIndex = superNodeBegin;
nodeIndex < superNodeEnd;
++nodeIndex)
{
nodesQueue.push_back(
std::make_pair(nodes[nodeIndex],
currentNode.second));
}
}
++level;
}
vector<uint> colors(3 * totalSegments);
for (uint colorIndex = 0; colorIndex < totalSegments; ++colorIndex)
{
colors[colorIndex * 3 ] = myrand() % 256;
colors[colorIndex * 3 + 1] = myrand() % 256;
colors[colorIndex * 3 + 2] = myrand() % 256;
}
uchar *image = new uchar[width * height * 3];
while (!nodesQueue.empty())
{
std::pair<uint, uint> currentNode = nodesQueue.front();
nodesQueue.pop_front();
uint pixelIndex = currentNode.first;
uint pixelSegment = currentNode.second;
image[pixelIndex * 3 ] = colors[pixelSegment * 3 ];
image[pixelIndex * 3 + 1] = colors[pixelSegment * 3 + 1];
image[pixelIndex * 3 + 2] = colors[pixelSegment * 3 + 2];
}
__savePPM(filename, image, width, height, 3);
delete[] image;
}
list<Level> levels_;
};
// The class that encapsulates the main algorithm.
class SegmentationTreeBuilder
{
public:
SegmentationTreeBuilder():verticesCount_(0),edgesCount_(0) {}
~SegmentationTreeBuilder() {}
// Repeatedly invokes the step of the algorithm
// until the limiting segmentation is found.
// Returns time (in ms) spent on building the tree.
float run(const Graph &graph, Pyramid &segmentations)
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// Allocate required memory pools. We need just 4 types of arrays.
MemoryPoolsCollection pools =
{
DeviceMemoryPool<uint>(
static_cast<uint>(graph.vertices.size()),
kUintVerticesPoolsRequired),
DeviceMemoryPool<float>(
static_cast<uint>(graph.vertices.size()),
kFloatVerticesPoolsRequired),
DeviceMemoryPool<uint>(
static_cast<uint>(graph.edges.size()),
kUintEdgesPoolsRequired),
DeviceMemoryPool<float>(
static_cast<uint>(graph.edges.size()),
kFloatEdgesPoolsRequired)
};
// Initialize internal variables
try
{
initalizeData(graph, pools);
}
catch (thrust::system_error &e)
{
cout << "Initialization failed (" << e.what() << ")" << endl;
exit(EXIT_FAILURE);
}
// Run steps
AlgorithmStatus status;
try
{
do
{
status = invokeStep(pools, segmentations);
}
while (status != ALGORITHM_FINISHED);
}
catch (thrust::system_error &e)
{
cout << "Algorithm failed (" << e.what() << ")" << endl;
exit(EXIT_FAILURE);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
return elapsedTime;
}
private:
void printMemoryUsage()
{
size_t availableMemory, totalMemory, usedMemory;
cudaMemGetInfo(&availableMemory, &totalMemory);
usedMemory = totalMemory - availableMemory;
cout << "Device memory: used " << usedMemory
<< " available " << availableMemory
<< " total " << totalMemory << endl;
}
struct MemoryPoolsCollection
{
DeviceMemoryPool<uint> uintVertices;
DeviceMemoryPool<float> floatVertices;
DeviceMemoryPool<uint> uintEdges;
DeviceMemoryPool<float> floatEdges;
};
static const uint kUintVerticesPoolsRequired = 8;
static const uint kFloatVerticesPoolsRequired = 3;
static const uint kUintEdgesPoolsRequired = 8;
static const uint kFloatEdgesPoolsRequired = 4;
void initalizeData(const Graph &graph, MemoryPoolsCollection &pools)
{
// Get memory for the internal variables
verticesCount_ = static_cast<uint>(graph.vertices.size());
edgesCount_ = static_cast<uint>(graph.edges.size());
dVertices_ = pools.uintVertices.get();
dEdges_ = pools.uintEdges.get();
dWeights_ = pools.floatEdges.get();
dOutputEdgesFlags_ = pools.uintEdges.get();
// Copy graph to the device memory
checkCudaErrors(cudaMemcpy(dVertices_.get(),
&(graph.vertices[0]),
sizeof(uint) * verticesCount_,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(dEdges_.get(),
&(graph.edges[0]),
sizeof(uint) * edgesCount_,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(dWeights_.get(),
&(graph.weights[0]),
sizeof(float) * edgesCount_,
cudaMemcpyHostToDevice));
thrust::fill(dOutputEdgesFlags_,
dOutputEdgesFlags_ + edgesCount_,
0);
}
static const uint kMaxThreadsPerBlock = 256;
// Calculates grid parameters of the consecutive kernel calls
// based on the number of elements in the array.
void calculateThreadsDistribution(uint totalElements,
uint &blocksCount,
uint &threadsPerBlockCount)
{
if (totalElements > kMaxThreadsPerBlock)
{
blocksCount =
(totalElements + kMaxThreadsPerBlock - 1) /
kMaxThreadsPerBlock;
threadsPerBlockCount = kMaxThreadsPerBlock;
}
else
{
blocksCount = 1;
threadsPerBlockCount = totalElements;
}
}
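// Example: totalElements = 1000 with kMaxThreadsPerBlock = 256 yields
// blocksCount = 4 and threadsPerBlockCount = 256, i.e. 1024 threads in total,
// slightly more than the number of elements.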
enum AlgorithmStatus { ALGORITHM_NOT_FINISHED, ALGORITHM_FINISHED };
AlgorithmStatus invokeStep(MemoryPoolsCollection &pools,
Pyramid &segmentations)
{
uint blocksCount, threadsPerBlockCount;
calculateThreadsDistribution(edgesCount_,
blocksCount,
threadsPerBlockCount);
dim3 gridDimsForEdges(blocksCount, 1, 1);
dim3 blockDimsForEdges(threadsPerBlockCount, 1, 1);
calculateThreadsDistribution(verticesCount_,
blocksCount,
threadsPerBlockCount);
dim3 gridDimsForVertices(blocksCount, 1, 1);
dim3 blockDimsForVertices(threadsPerBlockCount, 1, 1);
thrust::device_ptr<uint> dEdgesFlags = pools.uintEdges.get();
thrust::fill(dEdgesFlags, dEdgesFlags + edgesCount_, 0);
// Mark the first edge for each vertex in "dEdgesFlags"
markSegments<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
(dVertices_.get(), dEdgesFlags.get(), verticesCount_);
getLastCudaError("markSegments launch failed.");
// Now find minimum edges for each vertex.
thrust::device_ptr<uint> dMinScannedEdges =
pools.uintEdges.get();
thrust::device_ptr<float> dMinScannedWeights =
pools.floatEdges.get();
thrust::inclusive_scan_by_key(
dEdgesFlags,
dEdgesFlags + edgesCount_,
thrust::make_zip_iterator(
thrust::make_tuple(dWeights_, dEdges_)),
thrust::make_zip_iterator(
thrust::make_tuple(dMinScannedWeights, dMinScannedEdges)),
thrust::greater_equal<uint>(),
thrust::minimum< thrust::tuple<float, uint> >());
// To make things clear: let "dEdgesFlags" denote groups of edges that
// correspond to the same vertices. Then the last edge of each group
// correspond to the same vertices. Then the last edge of each group
// (in "dMinScannedEdges" and "dMinScannedWeights") is now minimal.
// Calculate a successor vertex for each vertex. A successor of the
// vertex v is a neighbouring vertex connected to v
// by the minimal edge.
thrust::device_ptr<uint> dSuccessors = pools.uintVertices.get();
getSuccessors<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
(dVertices_.get(),
dMinScannedEdges.get(),
dSuccessors.get(),
verticesCount_,
edgesCount_);
getLastCudaError("getSuccessors launch failed.");
pools.uintEdges.put(dMinScannedEdges);
pools.floatEdges.put(dMinScannedWeights);
// Remove cyclic successor dependencies. Note that there can be only
// two vertices in a cycle. See [1] for details.
removeCycles<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
(dSuccessors.get(), verticesCount_);
getLastCudaError("removeCycles launch failed.");
// Build up an array of startpoints for edges. As already stated,
// each group of edges denoted by "dEdgesFlags"
// has the same startpoint.
thrust::device_ptr<uint> dStartpoints = pools.uintEdges.get();
thrust::inclusive_scan(dEdgesFlags,
dEdgesFlags + edgesCount_,
dStartpoints);
addScalar<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
(dStartpoints.get(), -1, edgesCount_);
getLastCudaError("addScalar launch failed.");
// Shrink the chains of successors. New successors will eventually
// represent superpixels of the new level.
thrust::device_ptr<uint> dRepresentatives =
pools.uintVertices.get();
getRepresentatives
<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
(dSuccessors.get(),
dRepresentatives.get(),
verticesCount_);
getLastCudaError("getRepresentatives launch failed.");
swap(dSuccessors, dRepresentatives);
pools.uintVertices.put(dRepresentatives);
// Group vertices by successors' indices.
thrust::device_ptr<uint> dClusteredVerticesIDs =
pools.uintVertices.get();
thrust::sequence(dClusteredVerticesIDs,
dClusteredVerticesIDs + verticesCount_);
thrust::sort(
thrust::make_zip_iterator(
thrust::make_tuple(
thrust::device_ptr<uint> (dSuccessors),
thrust::device_ptr<uint> (dClusteredVerticesIDs))),
thrust::make_zip_iterator(
thrust::make_tuple(
thrust::device_ptr<uint> (dSuccessors +
verticesCount_),
thrust::device_ptr<uint> (dClusteredVerticesIDs +
verticesCount_))));
// Mark those groups.
thrust::device_ptr<uint> dVerticesFlags_ = pools.uintVertices.get();
thrust::fill(dVerticesFlags_, dVerticesFlags_ + verticesCount_, 0);
thrust::adjacent_difference(dSuccessors,
dSuccessors + verticesCount_,
dVerticesFlags_,
thrust::not_equal_to<uint>());
cudaMemset((void *) dVerticesFlags_.get(), 0, sizeof(uint));
// Assign new indices to the successors (the indices of vertices
// at the new level).
thrust::device_ptr<uint> dNewVerticesIDs_ =
pools.uintVertices.get();
thrust::inclusive_scan(dVerticesFlags_,
dVerticesFlags_ + verticesCount_,
dNewVerticesIDs_);
pools.uintVertices.put(dVerticesFlags_);
// Now we can easily calculate the number of resulting superpixels.
uint newVerticesCount;
cudaMemcpy(&newVerticesCount,
(dNewVerticesIDs_ + verticesCount_ - 1).get(),
sizeof(uint),
cudaMemcpyDeviceToHost);
++newVerticesCount;
// There are two special cases when we can stop our algorithm:
// 1) number of vertices in the graph remained unchanged;
// 2) only one vertex remains.
if (newVerticesCount == verticesCount_)
{
return ALGORITHM_FINISHED;
}
else if (newVerticesCount == 1)
{
thrust::device_ptr<uint> dDummyVerticesOffsets =
pools.uintVertices.get();
cudaMemset((void *) dDummyVerticesOffsets.get(),
0,
sizeof(uint));
thrust::device_ptr<uint> dDummyVerticesIDs =
pools.uintVertices.get();
thrust::sequence(dDummyVerticesIDs,
dDummyVerticesIDs + verticesCount_);
segmentations.addLevel(1,
verticesCount_,
dDummyVerticesOffsets,
dDummyVerticesIDs);
return ALGORITHM_FINISHED;
}
// Calculate how old vertices IDs map to new vertices IDs.
thrust::device_ptr<uint> dVerticesMapping =
pools.uintVertices.get();
getVerticesMapping
<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
(dClusteredVerticesIDs.get(),
dNewVerticesIDs_.get(),
dVerticesMapping.get(),
verticesCount_);
getLastCudaError("getVerticesMapping launch failed.");
pools.uintVertices.put(dNewVerticesIDs_);
pools.uintVertices.put(dClusteredVerticesIDs);
pools.uintVertices.put(dSuccessors);
// Invalidate self-loops in the reduced graph (the graph
// produced by merging all old vertices that have
// the same successor).
invalidateLoops<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
(dStartpoints.get(),
dVerticesMapping.get(),
dEdges_.get(),
edgesCount_);
getLastCudaError("invalidateLoops launch failed.");
// Calculate various information about the surviving edges
// (their new startpoint IDs and edge IDs) and about the
// non-surviving/contracted edges (their weights).
thrust::device_ptr<uint> dNewStartpoints = pools.uintEdges.get();
thrust::device_ptr<uint> dSurvivedEdgesIDs = pools.uintEdges.get();
calculateEdgesInfo<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
(dStartpoints.get(),
dVerticesMapping.get(),
dEdges_.get(),
dWeights_.get(),
dNewStartpoints.get(),
dSurvivedEdgesIDs.get(),
edgesCount_,
newVerticesCount);
getLastCudaError("calculateEdgesInfo launch failed.");
pools.uintEdges.put(dStartpoints);
// Group that information by the new startpoints IDs.
// Keep in mind that we want to build a new (reduced) graph and apply
// the next step of the algorithm to it. Hence we need to
// preserve the structure of the original graph: neighbours and
// weights should be grouped by vertex.
thrust::sort(
thrust::make_zip_iterator(
thrust::make_tuple(dNewStartpoints,
dSurvivedEdgesIDs)),
thrust::make_zip_iterator(
thrust::make_tuple(dNewStartpoints + edgesCount_,
dSurvivedEdgesIDs + edgesCount_)));
// Find the group of contracted edges.
uint *invalidEdgesPtr =
thrust::find_if(
dNewStartpoints,
dNewStartpoints + edgesCount_,
IsGreaterEqualThan<uint>(newVerticesCount)).get();
// Calculate how many edges there are in the reduced graph.
uint validEdgesCount =
static_cast<uint>(invalidEdgesPtr - dNewStartpoints.get());
// Mark groups of edges corresponding to the same vertex in the
// reduced graph.
thrust::adjacent_difference(dNewStartpoints,
dNewStartpoints + edgesCount_,
dEdgesFlags,
thrust::not_equal_to<uint>());
cudaMemset((void *) dEdgesFlags.get(), 0, sizeof(uint));
cudaMemset((void *) dEdgesFlags.get(), 1, 1);
pools.uintEdges.put(dNewStartpoints);
// Now we are able to build the reduced graph. See "Graph"
// class for the details on the graph's internal structure.
// Calculate vertices' offsets for the reduced graph.
thrust::copy_if(thrust::make_counting_iterator(0U),
thrust::make_counting_iterator(validEdgesCount),
dEdgesFlags,
dVertices_,
thrust::identity<uint>()).get();
pools.uintEdges.put(dEdgesFlags);
// Build up a neighbourhood for each vertex in the reduced graph
// (this includes recalculating edges' weights).
calculateThreadsDistribution(validEdgesCount,
blocksCount,
threadsPerBlockCount);
dim3 newGridDimsForEdges(blocksCount, 1, 1);
dim3 newBlockDimsForEdges(threadsPerBlockCount, 1, 1);
thrust::device_ptr<uint> dNewEdges = pools.uintEdges.get();
thrust::device_ptr<float> dNewWeights = pools.floatEdges.get();
makeNewEdges<<< newGridDimsForEdges,
newBlockDimsForEdges,
0 >>>
(dSurvivedEdgesIDs.get(),
dVerticesMapping.get(),
dEdges_.get(),
dWeights_.get(),
dNewEdges.get(),
dNewWeights.get(),
validEdgesCount);
getLastCudaError("makeNewEdges launch failed.");
swap(dEdges_, dNewEdges);
swap(dWeights_, dNewWeights);
pools.uintEdges.put(dNewEdges);
pools.floatEdges.put(dNewWeights);
pools.uintEdges.put(dSurvivedEdgesIDs);
// The graph's reconstruction is now finished.
// Build new level of the segmentation tree. It is a trivial task
// as we already have "dVerticesMapping" that contains all
// sufficient information about the vertices' transformations.
thrust::device_ptr<uint> dVerticesIDs =
pools.uintVertices.get();
thrust::device_ptr<uint> dNewVerticesOffsets =
pools.uintVertices.get();
thrust::sequence(dVerticesIDs, dVerticesIDs + verticesCount_);
thrust::sort_by_key(dVerticesMapping,
dVerticesMapping + verticesCount_,
dVerticesIDs);
thrust::unique_by_key_copy(dVerticesMapping,
dVerticesMapping + verticesCount_,
thrust::make_counting_iterator(0),
thrust::make_discard_iterator(),
dNewVerticesOffsets);
segmentations.addLevel(newVerticesCount,
verticesCount_,
dNewVerticesOffsets,
dVerticesIDs);
pools.uintVertices.put(dVerticesIDs);
pools.uintVertices.put(dNewVerticesOffsets);
pools.uintVertices.put(dVerticesMapping);
// We can now safely set new counts for vertices and edges.
verticesCount_ = newVerticesCount;
edgesCount_ = validEdgesCount;
return ALGORITHM_NOT_FINISHED;
}
uint verticesCount_;
uint edgesCount_;
thrust::device_ptr<uint> dVertices_;
thrust::device_ptr<uint> dEdges_;
thrust::device_ptr<float> dWeights_;
thrust::device_ptr<uint> dOutputEdgesFlags_;
};
// Loads PPM image.
int loadImage(const char *filename,
const char *executablePath,
vector<uchar3> &data,
uint &width,
uint &height)
{
const char *imagePath = sdkFindFilePath(filename, executablePath);
if (imagePath == NULL)
{
return -1;
}
uchar *dataHandle = NULL;
unsigned int channels;
if (!__loadPPM(imagePath, &dataHandle, &width, &height, &channels))
{
return -1;
}
data.assign(reinterpret_cast<uchar3 *>(dataHandle),
reinterpret_cast<uchar3 *>(dataHandle) + width * height);
free(reinterpret_cast<void *>(dataHandle));
return 0;
}
inline float distance(const uchar3 &first, const uchar3 &second)
{
int dx = static_cast<int>(first.x) - static_cast<int>(second.x);
int dy = static_cast<int>(first.y) - static_cast<int>(second.y);
int dz = static_cast<int>(first.z) - static_cast<int>(second.z);
uint sqrResult = dx * dx + dy * dy + dz * dz;
return sqrt(static_cast<float>(sqrResult));
}
// Builds a net-graph for the image with 4-connected pixels.
void buildGraph(const vector<uchar3> &image,
uint width,
uint height,
Graph &graph)
{
uint totalNodes = static_cast<uint>(image.size());
graph.vertices.resize(totalNodes);
    uint maxEdges = 4 * totalNodes - 2 * (width + height);
    graph.edges.reserve(maxEdges);
    graph.weights.reserve(maxEdges);
uint edgesProcessed = 0;
for (uint y = 0; y < height; ++y)
{
for (uint x = 0; x < width; ++x)
{
uint nodeIndex = y * width + x;
const uchar3 &centerPixel = image[nodeIndex];
graph.vertices[nodeIndex] = edgesProcessed;
if (y > 0)
{
uint lowerNodeIndex = (y - 1) * width + x;
const uchar3 &lowerPixel = image[lowerNodeIndex];
graph.edges.push_back(lowerNodeIndex);
graph.weights.push_back(distance(centerPixel, lowerPixel));
++edgesProcessed;
}
if (y + 1 < height)
{
uint upperNodeIndex = (y + 1) * width + x;
const uchar3 &upperPixel = image[upperNodeIndex];
graph.edges.push_back(upperNodeIndex);
graph.weights.push_back(distance(centerPixel, upperPixel));
++edgesProcessed;
}
if (x > 0)
{
uint leftNodeIndex = y * width + x - 1;
const uchar3 &leftPixel = image[leftNodeIndex];
graph.edges.push_back(leftNodeIndex);
graph.weights.push_back(distance(centerPixel, leftPixel));
++edgesProcessed;
}
if (x + 1 < width)
{
uint rightNodeIndex = y * width + x + 1;
const uchar3 &rightPixel = image[rightNodeIndex];
graph.edges.push_back(rightNodeIndex);
graph.weights.push_back(distance(centerPixel, rightPixel));
++edgesProcessed;
}
}
}
}
static char *kDefaultImageName = (char*)"test.ppm";
int main(int argc, char **argv)
{
vector<uchar3> image;
uint imageWidth, imageHeight;
char *imageName;
printf("%s Starting...\n\n", argv[0]);
imageName = (char *)kDefaultImageName;
if (checkCmdLineFlag(argc, (const char **) argv, "file"))
{
getCmdLineArgumentString(argc,
(const char **) argv,
"file",
&imageName);
}
if (loadImage(imageName, argv[0], image, imageWidth, imageHeight) != 0)
{
printf("Failed to open <%s>, program exit...\n", imageName);
exit(EXIT_FAILURE);
}
findCudaDevice(argc, (const char **)argv);
Graph graph;
buildGraph(image, imageWidth, imageHeight, graph);
Pyramid segmentations;
cout << "* Building segmentation tree... ";
cout.flush();
SegmentationTreeBuilder algo;
float elapsedTime = algo.run(graph, segmentations);
cout << "done in " << elapsedTime << " (ms)" << endl;
cout << "* Dumping levels for each tree..." << endl << endl;
segmentations.dump(imageWidth, imageHeight);
bool bResults[2];
bResults[0] = sdkComparePPM("level_00.ppm",
sdkFindFilePath("ref_00.ppm", argv[0]),
5.0f,
0.15f,
false);
bResults[1] = sdkComparePPM("level_09.ppm",
sdkFindFilePath("ref_09.ppm", argv[0]),
5.0f,
0.15f,
false);
exit((bResults[0] && bResults[1]) ? EXIT_SUCCESS : EXIT_FAILURE);
}
#ifndef _COMMON_CUH_
#define _COMMON_CUH_
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned long long int ullint;
#endif // #ifndef _COMMON_CUH_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Various kernels and functors used throughout the algorithm. For details
* on usage see "SegmentationTreeBuilder::invokeStep()".
*/
#ifndef _KERNELS_H_
#define _KERNELS_H_
#include <stdio.h>
#include <thrust/functional.h>
#include "common.cuh"
// Functors used with thrust library.
template <typename Input>
struct IsGreaterEqualThan : public thrust::unary_function<Input, bool>
{
__host__ __device__ IsGreaterEqualThan(uint upperBound) :
upperBound_(upperBound) {}
__host__ __device__ bool operator()(const Input &value) const
{
return value >= upperBound_;
}
uint upperBound_;
};
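// Illustrative usage sketch (hypothetical call, not taken from the sample):
// being a plain unary predicate, the functor can be handed to any Thrust
// algorithm, e.g. counting invalidated edge IDs:
//
//     uint invalidated = thrust::count_if(dSurvivedEdgesIDs,
//                                         dSurvivedEdgesIDs + edgesCount,
//                                         IsGreaterEqualThan<uint>(UINT_MAX));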
// CUDA kernels.
__global__ void addScalar(uint *array, int scalar, uint size)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < size)
{
array[tid] += scalar;
}
}
__global__ void markSegments(const uint *verticesOffsets,
uint *flags,
uint verticesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < verticesCount)
{
flags[verticesOffsets[tid]] = 1;
}
}
__global__ void getVerticesMapping(const uint *clusteredVerticesIDs,
const uint *newVerticesIDs,
uint *verticesMapping,
uint verticesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < verticesCount)
{
uint vertexID = clusteredVerticesIDs[tid];
verticesMapping[vertexID] = newVerticesIDs[tid];
}
}
__global__ void getSuccessors(const uint *verticesOffsets,
const uint *minScannedEdges,
uint *successors,
uint verticesCount,
uint edgesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < verticesCount)
{
uint successorPos = (tid < verticesCount - 1) ?
(verticesOffsets[tid + 1] - 1) :
(edgesCount - 1);
successors[tid] = minScannedEdges[successorPos];
}
}
__global__ void removeCycles(uint *successors,
uint verticesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < verticesCount)
{
uint successor = successors[tid];
uint nextSuccessor = successors[successor];
if (tid == nextSuccessor)
{
if (tid < successor)
{
successors[tid] = tid;
}
else
{
successors[successor] = successor;
}
}
}
}
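// Worked example: if successors = [1, 0, ...], vertices 0 and 1 form a
// 2-cycle. Each thread of the pair detects it (tid == successors[successor]);
// only the smaller index keeps itself as root, so the array becomes
// [0, 0, ...] and vertex 0 is the cycle's representative.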
__global__ void getRepresentatives(const uint *successors,
uint *representatives,
uint verticesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < verticesCount)
{
uint successor = successors[tid];
uint nextSuccessor = successors[successor];
while (successor != nextSuccessor)
{
successor = nextSuccessor;
nextSuccessor = successors[nextSuccessor];
}
representatives[tid] = successor;
}
}
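// Worked example (hypothetical successors array of four vertices after cycle
// removal): successors = [1, 1, 1, 2]. Thread 3 follows 3 -> 2 -> 1 -> 1
// (fixed point), so representatives[3] = 1; threads 0..2 resolve to 1 as
// well, giving representatives = [1, 1, 1, 1].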
__global__ void invalidateLoops(const uint *startpoints,
const uint *verticesMapping,
uint *edges,
uint edgesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < edgesCount)
{
uint startpoint = startpoints[tid];
uint &endpoint = edges[tid];
uint newStartpoint = verticesMapping[startpoint];
uint newEndpoint = verticesMapping[endpoint];
if (newStartpoint == newEndpoint)
{
endpoint = UINT_MAX;
}
}
}
__global__ void calculateEdgesInfo(const uint *startpoints,
const uint *verticesMapping,
const uint *edges,
const float *weights,
uint *newStartpoints,
uint *survivedEdgesIDs,
uint edgesCount,
uint newVerticesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < edgesCount)
{
uint startpoint = startpoints[tid];
uint endpoint = edges[tid];
newStartpoints[tid] = endpoint < UINT_MAX ?
verticesMapping[startpoint] :
newVerticesCount + verticesMapping[startpoint];
survivedEdgesIDs[tid] = endpoint < UINT_MAX ?
tid :
UINT_MAX;
}
}
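// Note on the bias above: edges whose endpoint was invalidated (UINT_MAX)
// get newVerticesCount added to their new startpoint, so a later sort by
// startpoint moves them past every surviving edge and they can be dropped
// together, while their IDs are marked UINT_MAX in survivedEdgesIDs.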
__global__ void makeNewEdges(const uint *survivedEdgesIDs,
const uint *verticesMapping,
const uint *edges,
const float *weights,
uint *newEdges,
float *newWeights,
uint edgesCount)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < edgesCount)
{
uint edgeID = survivedEdgesIDs[tid];
uint oldEdge = edges[edgeID];
newEdges[tid] = verticesMapping[oldEdge];
newWeights[tid] = weights[edgeID];
}
}
#endif // #ifndef _KERNELS_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SORTINGNETWORKS_COMMON_CUH
#define SORTINGNETWORKS_COMMON_CUH
#include "sortingNetworks_common.h"
//Enables maximum occupancy
#define SHARED_SIZE_LIMIT 1024U
//Map to single instructions on G8x / G9x / G100
#define UMUL(a, b) __umul24((a), (b))
#define UMAD(a, b, c) ( UMUL((a), (b)) + (c) )
__device__ inline void Comparator(
uint &keyA,
uint &valA,
uint &keyB,
uint &valB,
uint dir
)
{
uint t;
if ((keyA > keyB) == dir)
{
t = keyA;
keyA = keyB;
keyB = t;
t = valA;
valA = valB;
valB = t;
}
}
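// Worked example: with dir == 1 (ascending) and keyA = 5, keyB = 3, the test
// (keyA > keyB) == dir holds, so the key/value pairs are swapped and end up
// ordered as (3, 5); with dir == 0 (descending) the same input is untouched.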
#endif
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/oemen.htm
#include <assert.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "sortingNetworks_common.h"
#include "sortingNetworks_common.cuh"
////////////////////////////////////////////////////////////////////////////////
// Monolithic Batcher's sort kernel for short arrays fitting into shared memory
////////////////////////////////////////////////////////////////////////////////
__global__ void oddEvenMergeSortShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint dir
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
//Shared memory storage for one or more small vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size <= arrayLength; size <<= 1)
{
uint stride = size / 2;
uint offset = threadIdx.x & (stride - 1);
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
dir
);
stride >>= 1;
}
for (; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
if (offset >= stride)
Comparator(
s_key[pos - stride], s_val[pos - stride],
s_key[pos + 0], s_val[pos + 0],
dir
);
}
}
cg::sync(cta);
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Odd-even merge sort iteration kernel
// for large arrays (not fitting into shared memory)
////////////////////////////////////////////////////////////////////////////////
__global__ void oddEvenMergeGlobal(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint size,
uint stride,
uint dir
)
{
uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x;
//Odd-even merge
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
if (stride < size / 2)
{
uint offset = global_comparatorI & ((size / 2) - 1);
if (offset >= stride)
{
uint keyA = d_SrcKey[pos - stride];
uint valA = d_SrcVal[pos - stride];
uint keyB = d_SrcKey[pos + 0];
uint valB = d_SrcVal[pos + 0];
Comparator(
keyA, valA,
keyB, valB,
dir
);
d_DstKey[pos - stride] = keyA;
d_DstVal[pos - stride] = valA;
d_DstKey[pos + 0] = keyB;
d_DstVal[pos + 0] = valB;
}
}
else
{
uint keyA = d_SrcKey[pos + 0];
uint valA = d_SrcVal[pos + 0];
uint keyB = d_SrcKey[pos + stride];
uint valB = d_SrcVal[pos + stride];
Comparator(
keyA, valA,
keyB, valB,
dir
);
d_DstKey[pos + 0] = keyA;
d_DstVal[pos + 0] = valA;
d_DstKey[pos + stride] = keyB;
d_DstVal[pos + stride] = valB;
}
}
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
//Helper function
extern "C" uint factorRadix2(uint *log2L, uint L);
extern "C" void oddEvenMergeSort(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint dir
)
{
//Nothing to sort
if (arrayLength < 2)
return;
//Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert(factorizationRemainder == 1);
dir = (dir != 0);
uint blockCount = (batchSize * arrayLength) / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
if (arrayLength <= SHARED_SIZE_LIMIT)
{
assert(SHARED_SIZE_LIMIT % arrayLength == 0);
oddEvenMergeSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir);
}
else
{
oddEvenMergeSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, SHARED_SIZE_LIMIT, dir);
for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
for (unsigned stride = size / 2; stride > 0; stride >>= 1)
{
//Unlike with bitonic sort, combining bitonic merge steps with
//stride = [SHARED_SIZE_LIMIT / 2 .. 1] seems to be impossible as there are
//dependencies between data elements crossing the SHARED_SIZE_LIMIT borders
oddEvenMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, dir);
}
}
}
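// Illustrative call (hypothetical buffer names): sort "batch" independent
// arrays of "length" keys/values each in ascending order (dir = 1):
//
//     oddEvenMergeSort(d_OutKey, d_OutVal, d_InKey, d_InVal, batch, length, 1);
//
// "length" must be a power of two (enforced by the assert above), and
// batch * length is assumed to be a multiple of SHARED_SIZE_LIMIT so that
// the block count computed above is exact.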
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm
#include <assert.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "sortingNetworks_common.h"
#include "sortingNetworks_common.cuh"
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into shared memory
////////////////////////////////////////////////////////////////////////////////
__global__ void bitonicSortShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint dir
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
//Shared memory storage for one or more short vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
uint ddd = dir ^ ((threadIdx.x & (size / 2)) != 0);
for (uint stride = size / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
}
//ddd == dir for the last bitonic merge step
{
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
dir
);
}
}
cg::sync(cta);
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into shared memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortShared with the exception of
//even / odd subarrays being sorted in opposite directions
//Bitonic merge accepts both
//Ascending | descending or descending | ascending sorted pairs
__global__ void bitonicSortShared1(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
//Shared memory storage for current subarray
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
//Offset to the beginning of subarray and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1)
{
//Bitonic merge
uint ddd = (threadIdx.x & (size / 2)) != 0;
for (uint stride = size / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
}
//Odd / even arrays of SHARED_SIZE_LIMIT elements
//sorted in opposite directions
uint ddd = blockIdx.x & 1;
{
for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
}
cg::sync(cta);
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
//Bitonic merge iteration for stride >= SHARED_SIZE_LIMIT
__global__ void bitonicMergeGlobal(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint size,
uint stride,
uint dir
)
{
uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x;
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ((comparatorI & (size / 2)) != 0);
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
uint keyA = d_SrcKey[pos + 0];
uint valA = d_SrcVal[pos + 0];
uint keyB = d_SrcKey[pos + stride];
uint valB = d_SrcVal[pos + stride];
Comparator(
keyA, valA,
keyB, valB,
ddd
);
d_DstKey[pos + 0] = keyA;
d_DstVal[pos + 0] = valA;
d_DstKey[pos + stride] = keyB;
d_DstVal[pos + stride] = valB;
}
//Combined bitonic merge steps for
//size > SHARED_SIZE_LIMIT and stride = [1 .. SHARED_SIZE_LIMIT / 2]
__global__ void bitonicMergeShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint size,
uint dir
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
//Shared memory storage for current subarray
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
//Bitonic merge
uint comparatorI = UMAD(blockIdx.x, blockDim.x, threadIdx.x) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ((comparatorI & (size / 2)) != 0);
for (uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
cg::sync(cta);
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
//Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L)
{
if (!L)
{
*log2L = 0;
return 0;
}
else
{
for (*log2L = 0; (L & 1) == 0; L >>= 1, (*log2L)++);
return L;
}
}
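// Worked examples: factorRadix2(&log2L, 1024) returns 1 because 1024 is a
// pure power of two (which is what the sorts below require), whereas a call
// with L = 48 returns 3, the odd factor left after dividing out 2^4.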
extern "C" uint bitonicSort(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint dir
)
{
//Nothing to sort
if (arrayLength < 2)
return 0;
//Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert(factorizationRemainder == 1);
dir = (dir != 0);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
if (arrayLength <= SHARED_SIZE_LIMIT)
{
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
bitonicSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir);
}
else
{
bitonicSortShared1<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal);
for (uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
for (unsigned stride = size / 2; stride > 0; stride >>= 1)
if (stride >= SHARED_SIZE_LIMIT)
{
bitonicMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, dir);
}
else
{
bitonicMergeShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, dir);
break;
}
}
return threadCount;
}
/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <thrust/device_ptr.h>
#include <thrust/count.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <helper_cuda.h>
/////////////////////////////////////////////////////////////////
// Some utility code to define grid_stride_range
// Normally this would be in a header but it's here
// for didactic purposes. Uses the range helper from range.hpp, included below.
#include "range.hpp"
using namespace util::lang;
// type alias to simplify typing...
template<typename T>
using step_range = typename range_proxy<T>::step_range_proxy;
template <typename T>
__device__
step_range<T> grid_stride_range(T begin, T end) {
begin += blockDim.x * blockIdx.x + threadIdx.x;
return range(begin, end).step(gridDim.x * blockDim.x);
}
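// For reference, a minimal sketch (using only CUDA built-ins) of the classic
// grid-stride loop that the range helper above expands to:
//
//     template <typename T, typename Predicate>
//     __device__ void count_if_classic(int *count, T *data, int n, Predicate p)
//     {
//         for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//              i += gridDim.x * blockDim.x)
//         {
//             if (p(data[i])) atomicAdd(count, 1);
//         }
//     }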
/////////////////////////////////////////////////////////////////
template <typename T, typename Predicate>
__device__
void count_if(int *count, T *data, int n, Predicate p)
{
for (auto i : grid_stride_range(0, n)) {
if (p(data[i])) atomicAdd(count, 1);
}
}
// Use count_if with a lambda function that searches for x, y, z or w
// Note the use of range-based for loop and initializer_list inside the functor
// We use auto so we don't have to know the type of the functor or array
__global__
void xyzw_frequency(int *count, char *text, int n)
{
const char letters[] { 'x','y','z','w' };
count_if(count, text, n, [&](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
__global__
void xyzw_frequency_thrust_device(int *count, char *text, int n)
{
const char letters[] { 'x','y','z','w' };
*count = thrust::count_if(thrust::device, text, text+n, [=](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
// a bug in Thrust 1.8 causes warnings when this is uncommented
// so commented out by default -- fixed in Thrust master branch
#if 0
void xyzw_frequency_thrust_host(int *count, char *text, int n)
{
const char letters[] { 'x','y','z','w' };
*count = thrust::count_if(thrust::host, text, text+n, [&](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
#endif
int main(int argc, char** argv)
{
const char *filename = sdkFindFilePath("warandpeace.txt", argv[0]);
int numBytes = 16*1048576;
char *h_text = (char*)malloc(numBytes);
// find first CUDA device
int devID = findCudaDevice(argc, (const char **)argv);
char *d_text;
checkCudaErrors(cudaMalloc((void**)&d_text, numBytes));
FILE *fp = (filename != NULL) ? fopen(filename, "r") : NULL;
if (fp == NULL)
{
printf("Cannot find the input text file\n. Exiting..\n");
return EXIT_FAILURE;
}
int len = fread(h_text, sizeof(char), numBytes, fp);
fclose(fp);
std::cout << "Read " << len << " byte corpus from " << filename << std::endl;
checkCudaErrors(cudaMemcpy(d_text, h_text, len, cudaMemcpyHostToDevice));
int count = 0;
int *d_count;
checkCudaErrors(cudaMalloc(&d_count, sizeof(int)));
checkCudaErrors(cudaMemset(d_count, 0, sizeof(int)));
// Both kernels below compute the same count; comment out one call at a time to compare the two approaches
xyzw_frequency<<<8, 256>>>(d_count, d_text, len);
xyzw_frequency_thrust_device<<<1, 1>>>(d_count, d_text, len);
checkCudaErrors(cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost));
//xyzw_frequency_thrust_host(&count, h_text, len);
std::cout << "counted " << count << " instances of 'x', 'y', 'z', or 'w' in \""
<< filename << "\"" << std::endl;
checkCudaErrors(cudaFree(d_count));
checkCudaErrors(cudaFree(d_text));
return EXIT_SUCCESS;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Based on "Designing efficient sorting algorithms for manycore GPUs"
* by Nadathur Satish, Mark Harris, and Michael Garland
* http://mgarland.org/files/papers/gpusort-ipdps09.pdf
*
* Victor Podlozhnyuk 09/24/2009
*/
#include <assert.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b)
{
return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint getSampleCount(uint dividend)
{
return iDivUp(dividend, SAMPLE_STRIDE);
}
#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x)
{
/*
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
*/
return 1U << (W - __clz(x - 1));
}
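// Worked example: nextPowerOfTwo(33) evaluates 1U << (W - __clz(32)) =
// 1U << (32 - 26) = 64, the smallest power of two >= 33; values that are
// already powers of two (e.g. 64) map to themselves.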
template<uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
if (L == 0)
{
return 0;
}
uint pos = 0;
for (; stride > 0; stride >>= 1)
{
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val)))
{
pos = newPos;
}
}
return pos;
}
template<uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
if (L == 0)
{
return 0;
}
uint pos = 0;
for (; stride > 0; stride >>= 1)
{
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val)))
{
pos = newPos;
}
}
return pos;
}
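// Worked example (sortDir = 1, data = {1, 3, 3, 5}, L = 4, stride =
// nextPowerOfTwo(L) = 4, val = 3):
//   binarySearchInclusive returns 3 -- the number of elements <= val;
//   binarySearchExclusive returns 1 -- the number of elements <  val.
// Using the exclusive variant for one input and the inclusive variant for
// the other gives equal keys from the two halves distinct destination
// positions in the merge kernels below.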
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template<uint sortDir> __global__ void mergeSortSharedKernel(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint stride = 1; stride < arrayLength; stride <<= 1)
{
uint lPos = threadIdx.x & (stride - 1);
uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
cg::sync(cta);
uint keyA = baseKey[lPos + 0];
uint valA = baseVal[lPos + 0];
uint keyB = baseKey[lPos + stride];
uint valB = baseVal[lPos + stride];
uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;
cg::sync(cta);
baseKey[posA] = keyA;
baseVal[posA] = valA;
baseKey[posB] = keyB;
baseVal[posB] = valB;
}
cg::sync(cta);
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
static void mergeSortShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir
)
{
if (arrayLength < 2)
{
return;
}
assert(SHARED_SIZE_LIMIT % arrayLength == 0);
assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
if (sortDir)
{
mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<1><<<>>> failed\n");
}
else
{
mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<0><<<>>> failed\n");
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template<uint sortDir> __global__ void generateSampleRanksKernel(
uint *d_RanksA,
uint *d_RanksB,
uint *d_SrcKey,
uint stride,
uint N,
uint threadCount
)
{
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= threadCount)
{
return;
}
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
d_SrcKey += segmentBase;
d_RanksA += segmentBase / SAMPLE_STRIDE;
d_RanksB += segmentBase / SAMPLE_STRIDE;
const uint segmentElementsA = stride;
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
const uint segmentSamplesA = getSampleCount(segmentElementsA);
const uint segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA)
{
d_RanksA[i] = i * SAMPLE_STRIDE;
d_RanksB[i] = binarySearchExclusive<sortDir>(
d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride,
segmentElementsB, nextPowerOfTwo(segmentElementsB)
);
}
if (i < segmentSamplesB)
{
d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0,
segmentElementsA, nextPowerOfTwo(segmentElementsA)
);
}
}
static void generateSampleRanks(
uint *d_RanksA,
uint *d_RanksB,
uint *d_SrcKey,
uint stride,
uint N,
uint sortDir
)
{
uint lastSegmentElements = N % (2 * stride);
uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
if (sortDir)
{
generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
}
else
{
generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(
uint *d_Limits,
uint *d_Ranks,
uint stride,
uint N,
uint threadCount
)
{
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= threadCount)
{
return;
}
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
d_Ranks += (pos - i) * 2;
d_Limits += (pos - i) * 2;
const uint segmentElementsA = stride;
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
const uint segmentSamplesA = getSampleCount(segmentElementsA);
const uint segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA)
{
uint dstPos = binarySearchExclusive<1U>(d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
d_Limits[dstPos] = d_Ranks[i];
}
if (i < segmentSamplesB)
{
uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
}
}
static void mergeRanksAndIndices(
uint *d_LimitsA,
uint *d_LimitsB,
uint *d_RanksA,
uint *d_RanksB,
uint stride,
uint N
)
{
uint lastSegmentElements = N % (2 * stride);
uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
d_LimitsA,
d_RanksA,
stride,
N,
threadCount
);
getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
d_LimitsB,
d_RanksB,
stride,
N,
threadCount
);
getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template<uint sortDir> inline __device__ void merge(
uint *dstKey,
uint *dstVal,
uint *srcAKey,
uint *srcAVal,
uint *srcBKey,
uint *srcBVal,
uint lenA,
uint nPowTwoLenA,
uint lenB,
uint nPowTwoLenB,
cg::thread_block cta
)
{
uint keyA, valA, keyB, valB, dstPosA, dstPosB;
if (threadIdx.x < lenA)
{
keyA = srcAKey[threadIdx.x];
valA = srcAVal[threadIdx.x];
dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
}
if (threadIdx.x < lenB)
{
keyB = srcBKey[threadIdx.x];
valB = srcBVal[threadIdx.x];
dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
}
cg::sync(cta);
if (threadIdx.x < lenA)
{
dstKey[dstPosA] = keyA;
dstVal[dstPosA] = valA;
}
if (threadIdx.x < lenB)
{
dstKey[dstPosB] = keyB;
dstVal[dstPosB] = valB;
}
}
template<uint sortDir> __global__ void mergeElementaryIntervalsKernel(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE];
__shared__ uint s_val[2 * SAMPLE_STRIDE];
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
d_SrcKey += segmentBase;
d_SrcVal += segmentBase;
d_DstKey += segmentBase;
d_DstVal += segmentBase;
//Set up threadblock-wide parameters
__shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
if (threadIdx.x == 0)
{
uint segmentElementsA = stride;
uint segmentElementsB = umin(stride, N - segmentBase - stride);
uint segmentSamplesA = getSampleCount(segmentElementsA);
uint segmentSamplesB = getSampleCount(segmentElementsB);
uint segmentSamples = segmentSamplesA + segmentSamplesB;
startSrcA = d_LimitsA[blockIdx.x];
startSrcB = d_LimitsB[blockIdx.x];
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB;
startDstA = startSrcA + startSrcB;
startDstB = startDstA + lenSrcA;
}
//Load main input data
cg::sync(cta);
if (threadIdx.x < lenSrcA)
{
s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
}
if (threadIdx.x < lenSrcB)
{
s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
}
//Merge data in shared memory
cg::sync(cta);
merge<sortDir>(
s_key,
s_val,
s_key + 0,
s_val + 0,
s_key + SAMPLE_STRIDE,
s_val + SAMPLE_STRIDE,
lenSrcA, SAMPLE_STRIDE,
lenSrcB, SAMPLE_STRIDE,
cta
);
//Store merged data
cg::sync(cta);
if (threadIdx.x < lenSrcA)
{
d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
}
if (threadIdx.x < lenSrcB)
{
d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
}
}
static void mergeElementaryIntervals(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir
)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
if (sortDir)
{
mergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey,
d_DstVal,
d_SrcKey,
d_SrcVal,
d_LimitsA,
d_LimitsB,
stride,
N
);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
}
else
{
mergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey,
d_DstVal,
d_SrcKey,
d_SrcVal,
d_LimitsA,
d_LimitsB,
stride,
N
);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
}
}
extern "C" void bitonicSortShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir
);
extern "C" void bitonicMergeElementaryIntervals(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir
);
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;
extern "C" void initMergeSort(void)
{
checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}
extern "C" void closeMergeSort(void)
{
checkCudaErrors(cudaFree(d_RanksA));
checkCudaErrors(cudaFree(d_RanksB));
checkCudaErrors(cudaFree(d_LimitsB));
checkCudaErrors(cudaFree(d_LimitsA));
}
extern "C" void mergeSort(
uint *d_DstKey,
uint *d_DstVal,
uint *d_BufKey,
uint *d_BufVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint N,
uint sortDir
)
{
uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++);
uint *ikey, *ival, *okey, *oval;
if (stageCount & 1)
{
ikey = d_BufKey;
ival = d_BufVal;
okey = d_DstKey;
oval = d_DstVal;
}
else
{
ikey = d_DstKey;
ival = d_DstVal;
okey = d_BufKey;
oval = d_BufVal;
}
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
assert(N % SHARED_SIZE_LIMIT == 0);
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1)
{
uint lastSegmentElements = N % (2 * stride);
//Find sample ranks and prepare for limiters merge
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
//Merge ranks and indices
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
//Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);
if (lastSegmentElements <= stride)
{
//Last merge segment consists of a single array which just needs to be passed through
checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
}
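// Illustrative call sequence (hypothetical buffer names); N must satisfy the
// asserts above (a multiple of SHARED_SIZE_LIMIT, at most
// SAMPLE_STRIDE * MAX_SAMPLE_COUNT):
//
//     initMergeSort();
//     mergeSort(d_OutKey, d_OutVal, d_BufKey, d_BufVal,
//               d_InKey, d_InVal, N, 1 /* ascending */);
//     closeMergeSort();
//
// The stageCount parity check above guarantees that the final merge pass
// writes its output into d_DstKey/d_DstVal rather than the temporary buffer.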
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <assert.h>
#include "mergeSort_common.h"
inline __device__ void Comparator(
uint &keyA,
uint &valA,
uint &keyB,
uint &valB,
uint arrowDir
)
{
uint t;
if ((keyA > keyB) == arrowDir)
{
t = keyA;
keyA = keyB;
keyB = t;
t = valA;
valA = valB;
valB = t;
}
}
__global__ void bitonicSortSharedKernel(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint sortDir
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
//Shared memory storage for one or more short vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
uint dir = (threadIdx.x & (size / 2)) != 0;
for (uint stride = size / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
dir
);
}
}
//dir == sortDir for the last bitonic merge step
{
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
sortDir
);
}
}
cg::sync(cta);
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
//Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L)
{
if (!L)
{
*log2L = 0;
return 0;
}
else
{
for (*log2L = 0; (L & 1) == 0; L >>= 1, (*log2L)++);
return L;
}
}
extern "C" void bitonicSortShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir
)
{
//Nothing to sort
if (arrayLength < 2)
{
return;
}
//Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert(factorizationRemainder == 1);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
assert(arrayLength <= SHARED_SIZE_LIMIT);
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b)
{
return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint getSampleCount(uint dividend)
{
return iDivUp(dividend, SAMPLE_STRIDE);
}
template<uint sortDir> static inline __device__ void ComparatorExtended(
uint &keyA,
uint &valA,
uint &flagA,
uint &keyB,
uint &valB,
uint &flagB,
uint arrowDir
)
{
uint t;
if (
(!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
((arrowDir == sortDir) && (flagA == 1)) ||
((arrowDir != sortDir) && (flagB == 1))
)
{
t = keyA;
keyA = keyB;
keyB = t;
t = valA;
valA = valB;
valB = t;
t = flagA;
flagA = flagB;
flagB = t;
}
}
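// The flagA/flagB arguments correspond to the s_inf[] markers set up in the
// kernel below: slots that were never loaded keep flag == 1 and behave like
// +/- infinity, sinking past all real elements in the sort direction, so the
// bitonic merge works even when lenSrcA + lenSrcB < 2 * SAMPLE_STRIDE.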
template<uint sortDir> __global__ void bitonicMergeElementaryIntervalsKernel(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N
)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE];
__shared__ uint s_val[2 * SAMPLE_STRIDE];
__shared__ uint s_inf[2 * SAMPLE_STRIDE];
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
d_SrcKey += segmentBase;
d_SrcVal += segmentBase;
d_DstKey += segmentBase;
d_DstVal += segmentBase;
//Set up threadblock-wide parameters
__shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
if (threadIdx.x == 0)
{
uint segmentElementsA = stride;
uint segmentElementsB = umin(stride, N - segmentBase - stride);
uint segmentSamplesA = stride / SAMPLE_STRIDE;
uint segmentSamplesB = getSampleCount(segmentElementsB);
uint segmentSamples = segmentSamplesA + segmentSamplesB;
startSrcA = d_LimitsA[blockIdx.x];
startSrcB = d_LimitsB[blockIdx.x];
startDst = startSrcA + startSrcB;
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB;
}
s_inf[threadIdx.x + 0] = 1;
s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;
//Load input data
cg::sync(cta);
if (threadIdx.x < lenSrcA)
{
s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
s_inf[threadIdx.x] = 0;
}
//Prepare for bitonic merge by reversing the ordering of the second input
if (threadIdx.x < lenSrcB)
{
s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
}
//"Extended" bitonic merge
for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1)
{
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
ComparatorExtended<sortDir>(
s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
s_key[pos + stride], s_val[pos + stride], s_inf[pos + stride],
sortDir
);
}
//Store sorted data
cg::sync(cta);
d_DstKey += startDst;
d_DstVal += startDst;
if (threadIdx.x < lenSrcA)
{
d_DstKey[threadIdx.x] = s_key[threadIdx.x];
d_DstVal[threadIdx.x] = s_val[threadIdx.x];
}
if (threadIdx.x < lenSrcB)
{
d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
}
}
extern "C" void bitonicMergeElementaryIntervals(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir
)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs =
(lastSegmentElements > stride) ?
getSampleCount(N) :
(N - lastSegmentElements) / SAMPLE_STRIDE;
if (sortDir)
{
bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey,
d_DstVal,
d_SrcKey,
d_SrcVal,
d_LimitsA,
d_LimitsB,
stride,
N
);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
}
else
{
bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey,
d_DstVal,
d_SrcKey,
d_SrcVal,
d_LimitsA,
d_LimitsB,
stride,
N
);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
}
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample demonstrates how to use query information on the current system
* topology using a SDK 8.0 API.
*/
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
int main(int argc, char **argv)
{
int deviceCount = 0;
checkCudaErrors(cudaGetDeviceCount(&deviceCount));
// Enumerates Device <-> Device links
for (int device1 = 0; device1 < deviceCount; device1++)
{
for (int device2 = 0; device2 < deviceCount; device2++)
{
if (device1 == device2)
continue;
int perfRank = 0;
int atomicSupported = 0;
int accessSupported = 0;
checkCudaErrors(cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
checkCudaErrors(cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));
if (accessSupported)
{
std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
std::cout << " * Perf Rank: " << perfRank << std::endl;
}
}
}
// Enumerates Device <-> Host links
for (int device = 0; device < deviceCount; device++)
{
int atomicSupported = 0;
checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
std::cout << "GPU" << device << " <-> CPU:" << std::endl;
std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
}
return 0;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This is a simple test program to measure the memcopy bandwidth of the GPU.
* It can measure device to device copy bandwidth, host to device copy bandwidth
* for pageable and pinned memory, and device to host copy bandwidth for pageable
* and pinned memory.
*
* Usage:
* ./bandwidthTest [option]...
*/
// CUDA runtime
#include <cuda_runtime.h>
// includes
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <cuda.h>
#include <memory>
#include <iostream>
#include <cassert>
static const char *sSDKsample = "CUDA Bandwidth Test";
// defines, project
#define MEMCOPY_ITERATIONS 100
#define DEFAULT_SIZE ( 32 * ( 1 << 20 ) ) //32 M
#define DEFAULT_INCREMENT (1 << 22) //4 M
#define CACHE_CLEAR_SIZE (1 << 24) //16 M
//shmoo mode defines
#define SHMOO_MEMSIZE_MAX (1 << 26) //64 M
#define SHMOO_MEMSIZE_START (1 << 10) //1 KB
#define SHMOO_INCREMENT_1KB (1 << 10) //1 KB
#define SHMOO_INCREMENT_2KB (1 << 11) //2 KB
#define SHMOO_INCREMENT_10KB (10 * (1 << 10)) //10KB
#define SHMOO_INCREMENT_100KB (100 * (1 << 10)) //100 KB
#define SHMOO_INCREMENT_1MB (1 << 20) //1 MB
#define SHMOO_INCREMENT_2MB (1 << 21) //2 MB
#define SHMOO_INCREMENT_4MB (1 << 22) //4 MB
#define SHMOO_LIMIT_20KB (20 * (1 << 10)) //20 KB
#define SHMOO_LIMIT_50KB (50 * (1 << 10)) //50 KB
#define SHMOO_LIMIT_100KB (100 * (1 << 10)) //100 KB
#define SHMOO_LIMIT_1MB (1 << 20) //1 MB
#define SHMOO_LIMIT_16MB (1 << 24) //16 MB
#define SHMOO_LIMIT_32MB (1 << 25) //32 MB
//enums, project
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };
const char *sMemoryCopyKind[] =
{
"Device to Host",
"Host to Device",
"Device to Device",
NULL
};
const char *sMemoryMode[] =
{
"PINNED",
"PAGEABLE",
NULL
};
// if true, use CPU based timing for everything
static bool bDontUseGPUTiming;
int *pArgc = NULL;
char **pArgv = NULL;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment,
memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs, bool wc);
void printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs, bool wc);
void printHelp(void);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
pArgc = &argc;
pArgv = argv;
// set logfile name and start logs
printf("[%s] - Starting...\n", sSDKsample);
int iRetVal = runTest(argc, (const char **)argv);
if (iRetVal < 0)
{
checkCudaErrors(cudaSetDevice(0));
}
// finish
printf("%s\n", (iRetVal==0) ? "Result = PASS" : "Result = FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
exit((iRetVal==0) ? EXIT_SUCCESS : EXIT_FAILURE);
}
///////////////////////////////////////////////////////////////////////////////
//Parse args, run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
int runTest(const int argc, const char **argv)
{
int start = DEFAULT_SIZE;
int end = DEFAULT_SIZE;
int startDevice = 0;
int endDevice = 0;
int increment = DEFAULT_INCREMENT;
testMode mode = QUICK_MODE;
bool htod = false;
bool dtoh = false;
bool dtod = false;
bool wc = false;
char *modeStr;
char *device = NULL;
printMode printmode = USER_READABLE;
char *memModeStr = NULL;
memoryMode memMode = PINNED;
//process command line args
if (checkCmdLineFlag(argc, argv, "help"))
{
printHelp();
return 0;
}
if (checkCmdLineFlag(argc, argv, "csv"))
{
printmode = CSV;
}
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr))
{
if (strcmp(memModeStr, "pageable") == 0)
{
memMode = PAGEABLE;
}
else if (strcmp(memModeStr, "pinned") == 0)
{
memMode = PINNED;
}
else
{
printf("Invalid memory mode - valid modes are pageable or pinned\n");
printf("See --help for more information\n");
return -1000;
}
}
else
{
//default - pinned memory
memMode = PINNED;
}
if (getCmdLineArgumentString(argc, argv, "device", &device))
{
int deviceCount;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess)
{
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
exit(EXIT_FAILURE);
}
if (deviceCount == 0)
{
printf("!!!!!No devices found!!!!!\n");
return -2000;
}
if (strcmp(device, "all") == 0)
{
printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices !!!!!!\n\n");
startDevice = 0;
endDevice = deviceCount-1;
}
else
{
startDevice = endDevice = atoi(device);
if (startDevice >= deviceCount || startDevice < 0)
{
printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", startDevice,0);
startDevice = endDevice = 0;
}
}
}
printf("Running on...\n\n");
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
{
cudaDeviceProp deviceProp;
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
if (error_id == cudaSuccess)
{
printf(" Device %d: %s\n", currentDevice, deviceProp.name);
if (deviceProp.computeMode == cudaComputeModeProhibited)
{
fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
}
}
else
{
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
}
}
if (getCmdLineArgumentString(argc, argv, "mode", &modeStr))
{
//figure out the mode
if (strcmp(modeStr, "quick") == 0)
{
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
}
else if (strcmp(modeStr, "shmoo") == 0)
{
printf(" Shmoo Mode\n\n");
mode = SHMOO_MODE;
}
else if (strcmp(modeStr, "range") == 0)
{
printf(" Range Mode\n\n");
mode = RANGE_MODE;
}
else
{
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
printf("See --help for more information\n");
return -3000;
}
}
else
{
//default mode - quick
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
}
if (checkCmdLineFlag(argc, argv, "htod"))
{
htod = true;
}
if (checkCmdLineFlag(argc, argv, "dtoh"))
{
dtoh = true;
}
if (checkCmdLineFlag(argc, argv, "dtod"))
{
dtod = true;
}
#if CUDART_VERSION >= 2020
if (checkCmdLineFlag(argc, argv, "wc"))
{
wc = true;
}
#endif
if (checkCmdLineFlag(argc, argv, "cputiming"))
{
bDontUseGPUTiming = true;
}
if (!htod && !dtoh && !dtod)
{
//default: All
htod = true;
dtoh = true;
dtod = true;
}
if (RANGE_MODE == mode)
{
if (checkCmdLineFlag(argc, (const char **)argv, "start"))
{
start = getCmdLineArgumentInt(argc, argv, "start");
if (start <= 0)
{
printf("Illegal argument - start must be greater than zero\n");
return -4000;
}
}
else
{
printf("Must specify a starting size in range mode\n");
printf("See --help for more information\n");
return -5000;
}
if (checkCmdLineFlag(argc, (const char **)argv, "end"))
{
end = getCmdLineArgumentInt(argc, argv, "end");
if (end <= 0)
{
printf("Illegal argument - end must be greater than zero\n");
return -6000;
}
if (start > end)
{
printf("Illegal argument - start is greater than end\n");
return -7000;
}
}
else
{
printf("Must specify an end size in range mode.\n");
printf("See --help for more information\n");
return -8000;
}
if (checkCmdLineFlag(argc, argv, "increment"))
{
increment = getCmdLineArgumentInt(argc, argv, "increment");
if (increment <= 0)
{
printf("Illegal argument - increment must be greater than zero\n");
return -9000;
}
}
else
{
printf("Must specify an increment in user mode\n");
printf("See --help for more information\n");
return -10000;
}
}
if (htod)
{
testBandwidth((unsigned int)start, (unsigned int)end, (unsigned int)increment,
mode, HOST_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
}
if (dtoh)
{
testBandwidth((unsigned int)start, (unsigned int)end, (unsigned int)increment,
mode, DEVICE_TO_HOST, printmode, memMode, startDevice, endDevice, wc);
}
if (dtod)
{
testBandwidth((unsigned int)start, (unsigned int)end, (unsigned int)increment,
mode, DEVICE_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
}
// Ensure that we reset all CUDA Devices in question
for (int nDevice = startDevice; nDevice <= endDevice; nDevice++)
{
cudaSetDevice(nDevice);
}
return 0;
}
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
void
testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
{
switch (mode)
{
case QUICK_MODE:
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
break;
case RANGE_MODE:
testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
break;
case SHMOO_MODE:
testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
break;
default:
break;
}
}
//////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
void
testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
{
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
}
///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
void
testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment,
memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
{
//count the number of copies we're going to run
unsigned int count = 1 + ((end - start) / increment);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *) malloc(count * sizeof(double));
// Before calculating the cumulative bandwidth, initialize the bandwidths array to 0.0
for (unsigned int i = 0; i < count; i++)
{
bandwidths[i] = 0.0;
}
// Use the devices specified by the user
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
{
cudaSetDevice(currentDevice);
//run each of the copies
for (unsigned int i = 0; i < count; i++)
{
memSizes[i] = start + i * increment;
switch (kind)
{
case DEVICE_TO_HOST:
bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
break;
case HOST_TO_DEVICE:
bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
break;
case DEVICE_TO_DEVICE:
bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
break;
}
}
} // Complete the bandwidth computation on all the devices
//print results
if (printmode == CSV)
{
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
else
{
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
//clean up
free(memSizes);
free(bandwidths);
}
//////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//////////////////////////////////////////////////////////////////////////////
void
testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
{
//count the number of copies to make
unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
+ ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
+ ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
+ ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
+ ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
+ ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
+ ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
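// The count mirrors the per-band step sizes used in the shmoo loop below:
// 1 KB steps up to 20 KB, 2 KB steps up to 50 KB, 10 KB steps up to 100 KB,
// 100 KB steps up to 1 MB, 1 MB steps up to 16 MB, 2 MB steps up to 32 MB,
// and 4 MB steps up to the 64 MB maximum.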
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *) malloc(count * sizeof(double));
// Before calculating the cumulative bandwidth, initialize the bandwidths array to 0.0
for (unsigned int i = 0; i < count; i++)
{
bandwidths[i] = 0.0;
}
// Use the devices specified by the user
for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
{
cudaSetDevice(currentDevice);
//Run the shmoo
int iteration = 0;
unsigned int memSize = 0;
while (memSize <= SHMOO_MEMSIZE_MAX)
{
if (memSize < SHMOO_LIMIT_20KB)
{
memSize += SHMOO_INCREMENT_1KB;
}
else if (memSize < SHMOO_LIMIT_50KB)
{
memSize += SHMOO_INCREMENT_2KB;
}
else if (memSize < SHMOO_LIMIT_100KB)
{
memSize += SHMOO_INCREMENT_10KB;
}
else if (memSize < SHMOO_LIMIT_1MB)
{
memSize += SHMOO_INCREMENT_100KB;
}
else if (memSize < SHMOO_LIMIT_16MB)
{
memSize += SHMOO_INCREMENT_1MB;
}
else if (memSize < SHMOO_LIMIT_32MB)
{
memSize += SHMOO_INCREMENT_2MB;
}
else
{
memSize += SHMOO_INCREMENT_4MB;
}
memSizes[iteration] = memSize;
switch (kind)
{
case DEVICE_TO_HOST:
bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
break;
case HOST_TO_DEVICE:
bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
break;
case DEVICE_TO_DEVICE:
bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
break;
}
iteration++;
printf(".");
}
} // Complete the bandwidth computation on all the devices
//print results
printf("\n");
if (CSV == printmode)
{
printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
else
{
printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
}
//clean up
free(memSizes);
free(bandwidths);
}
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float
testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
unsigned char *h_idata = NULL;
unsigned char *h_odata = NULL;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
//allocate host memory
if (PINNED == memMode)
{
//pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
}
else
{
//pageable memory mode - use malloc
h_idata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize);
if (h_idata == 0 || h_odata == 0)
{
fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
exit(EXIT_FAILURE);
}
}
//initialize the memory
for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
//initialize the device memory
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize,
cudaMemcpyHostToDevice));
//copy data from GPU to Host
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
if (PINNED == memMode)
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
cudaMemcpyDeviceToHost, 0));
}
}
else
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize,
cudaMemcpyDeviceToHost));
}
}
checkCudaErrors(cudaEventRecord(stop, 0));
// make sure GPU has finished copying
checkCudaErrors(cudaDeviceSynchronize());
//get the total elapsed time in ms
sdkStopTimer(&timer);
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (PINNED != memMode || bDontUseGPUTiming)
{
elapsedTimeInMs = sdkGetTimerValue(&timer);
}
//calculate bandwidth in MB/s
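// memSize * (1<<10) / (1<<20) reduces to memSize / 1024, i.e. the transfer
// size in KiB; dividing KiB by the elapsed time in ms gives a rate of
// approximately MB/s (1 KiB/ms = 1,024,000 bytes/s).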
bandwidthInMBs = ((float)(1<<10) * memSize * (float)MEMCOPY_ITERATIONS) /
(elapsedTimeInMs * (float)(1 << 20));
//clean up memory
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
sdkDeleteTimer(&timer);
if (PINNED == memMode)
{
checkCudaErrors(cudaFreeHost(h_idata));
checkCudaErrors(cudaFreeHost(h_odata));
}
else
{
free(h_idata);
free(h_odata);
}
checkCudaErrors(cudaFree(d_idata));
return bandwidthInMBs;
}
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float
testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
//allocate host memory
unsigned char *h_odata = NULL;
if (PINNED == memMode)
{
#if CUDART_VERSION >= 2020
//pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
//pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
}
else
{
//pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
if (h_odata == 0)
{
fprintf(stderr, "Not enough memory available on host to run test!\n");
exit(EXIT_FAILURE);
}
}
unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
if (h_cacheClear1 == 0 || h_cacheClear2 == 0)
{
fprintf(stderr, "Not enough memory available on host to run test!\n");
exit(EXIT_FAILURE);
}
//initialize the memory
for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_odata[i] = (unsigned char)(i & 0xff);
}
for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++)
{
h_cacheClear1[i] = (unsigned char)(i & 0xff);
h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff));
}
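// Note: in this code path the cache-clear buffers are allocated and
// initialized but not otherwise referenced before being freed below.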
//allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
//copy host memory to device memory
if (PINNED == memMode)
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
cudaMemcpyHostToDevice, 0));
}
}
else
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize,
cudaMemcpyHostToDevice));
}
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
//total elapsed time in ms
sdkStopTimer(&timer);
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (PINNED != memMode || bDontUseGPUTiming)
{
elapsedTimeInMs = sdkGetTimerValue(&timer);
}
sdkResetTimer(&timer);
//calculate bandwidth in MB/s
bandwidthInMBs = ((float)(1<<10) * memSize * (float)MEMCOPY_ITERATIONS) /
(elapsedTimeInMs * (float)(1 << 20));
//clean up memory
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
sdkDeleteTimer(&timer);
if (PINNED == memMode)
{
checkCudaErrors(cudaFreeHost(h_odata));
}
else
{
free(h_odata);
}
free(h_cacheClear1);
free(h_cacheClear2);
checkCudaErrors(cudaFree(d_idata));
return bandwidthInMBs;
}
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float
testDeviceToDeviceTransfer(unsigned int memSize)
{
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
//allocate host memory
unsigned char *h_idata = (unsigned char *)malloc(memSize);
if (h_idata == 0)
{
fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
exit(EXIT_FAILURE);
}
//initialize the host memory
for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_idata[i] = (unsigned char)(i & 0xff);
}
//allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
unsigned char *d_odata;
checkCudaErrors(cudaMalloc((void **) &d_odata, memSize));
//initialize memory
checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize,
cudaMemcpyHostToDevice));
//run the memcopy
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize,
cudaMemcpyDeviceToDevice));
}
checkCudaErrors(cudaEventRecord(stop, 0));
//Since device to device memory copies are non-blocking,
//cudaDeviceSynchronize() is required in order to get
//proper timing.
checkCudaErrors(cudaDeviceSynchronize());
//get the total elapsed time in ms
sdkStopTimer(&timer);
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming)
{
elapsedTimeInMs = sdkGetTimerValue(&timer);
}
//calculate bandwidth in MB/s
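// The 2.0f factor accounts for device memory traffic in both directions:
// each device-to-device copy reads memSize bytes and writes memSize bytes.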
bandwidthInMBs = 2.0f * ((float)(1<<10) * memSize * (float)MEMCOPY_ITERATIONS) /
(elapsedTimeInMs * (float)(1 << 20));
//clean up memory
sdkDeleteTimer(&timer);
free(h_idata);
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata));
return bandwidthInMBs;
}
/////////////////////////////////////////////////////////
//print results in an easily read format
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs, bool wc)
{
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
if (wc)
{
printf(" Write-Combined Memory Writes are Enabled");
}
printf(" Transfer Size (Bytes)\tBandwidth(MB/s)\n");
unsigned int i;
for (i = 0; i < (count - 1); i++)
{
printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]);
}
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]);
}
///////////////////////////////////////////////////////////////////////////
//print results in a database format
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs, bool wc)
{
std::string sConfig;
// log config information
if (kind == DEVICE_TO_DEVICE)
{
sConfig += "D2D";
}
else
{
if (kind == DEVICE_TO_HOST)
{
sConfig += "D2H";
}
else if (kind == HOST_TO_DEVICE)
{
sConfig += "H2D";
}
if (memMode == PAGEABLE)
{
sConfig += "-Paged";
}
else if (memMode == PINNED)
{
sConfig += "-Pinned";
if (wc)
{
sConfig += "-WriteCombined";
}
}
}
unsigned int i;
double dSeconds = 0.0;
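// Recover an approximate per-copy time from the reported bandwidth:
// seconds = bytes / (bandwidth * (1 << 20))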
for (i = 0; i < count; i++)
{
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1<<20));
printf("bandwidthTest-%s, Bandwidth = %.1f MB/s, Time = %.5f s, Size = %u bytes, NumDevsUsed = %d\n",
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
}
}
///////////////////////////////////////////////////////////////////////////
//Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void)
{
printf("Usage: bandwidthTest [OPTION]...\n");
printf("Test the bandwidth for device to host, host to device, and device to device transfers\n");
printf("\n");
printf("Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n");
printf("\n");
printf("Options:\n");
printf("--help\tDisplay this help menu\n");
printf("--csv\tPrint results as a CSV\n");
printf("--device=[deviceno]\tSpecify the device device to be used\n");
printf(" all - compute cumulative bandwidth on all the devices\n");
printf(" 0,1,2,...,n - Specify any particular device to be used\n");
printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
printf(" pageable - pageable memory\n");
printf(" pinned - non-pageable system memory\n");
printf("--mode=[MODE]\tSpecify the mode to use\n");
printf(" quick - performs a quick measurement\n");
printf(" range - measures a user-specified range of values\n");
printf(" shmoo - performs an intense shmoo of a large range of values\n");
printf("--htod\tMeasure host to device transfers\n");
printf("--dtoh\tMeasure device to host transfers\n");
printf("--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
printf("--cputiming\tForce CPU-based timing always\n");
printf("Range mode options\n");
printf("--start=[SIZE]\tStarting transfer size in bytes\n");
printf("--end=[SIZE]\tEnding transfer size in bytes\n");
printf("--increment=[SIZE]\tIncrement size in bytes\n");
}
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <helper_cuda.h>
#include <helper_timer.h>
#include "commonDefs.hpp"
#include "commonKernels.hpp"
#define VERIFY_GPU_CORRECTNESS 0
size_t maxSampleSizeInMb = 64;
int numKernelRuns = 20;
int verboseResults = 0;
const char *memAllocTypeStr[MEMALLOC_TYPE_COUNT] = {
"Managed_Memory_With_Hints",
"Managed_Memory_With_Hints_FullyAsync",
"Managed_Memory_NoHints",
"Zero_Copy",
"Memcpy_HostMalloc_DeviceCudaMalloc",
"MemcpyAsync_HostMalloc_DeviceCudaMalloc",
"Memcpy_HostCudaHostAlloc_DeviceCudaMalloc",
"MemcpyAsync_HostCudaHostAlloc_DeviceCudaMalloc"};
const char *memAllocTypeShortStr[MEMALLOC_TYPE_COUNT] = {
"UMhint", // Managed Memory With Hints
"UMhntAs", // Managed Memory With_Hints Async
"UMeasy", // Managed_Memory with No Hints
"0Copy", // Zero Copy
"MemCopy", // USE HOST PAGEABLE AND DEVICE_MEMORY
"CpAsync", // USE HOST PAGEABLE AND DEVICE_MEMORY ASYNC
"CpHpglk", // USE HOST PAGELOCKED AND DEVICE MEMORY
"CpPglAs" // USE HOST PAGELOCKED AND DEVICE MEMORY ASYNC
};
static float RandFloat(float low, float high) {
float t = (float)rand() / (float)RAND_MAX;
return (1.0f - t) * low + t * high;
}
void fillMatrixWithRandomValues(float *matrix, unsigned int matrixDim) {
unsigned int i, j;
for (i = 0; i < matrixDim; ++i) {
for (j = 0; j < matrixDim; ++j) {
matrix[j + i * matrixDim] = RandFloat(0.0f, 10.0f);
}
}
}
#if VERIFY_GPU_CORRECTNESS
void verifyMatrixMultiplyCorrectness(float *C, float *A, float *B,
unsigned int matrixDim) {
unsigned int i, j, k, numErrors = 0;
for (i = 0; i < matrixDim; ++i) {
for (j = 0; j < matrixDim; ++j) {
float result = 0.0f;
for (k = 0; k < matrixDim; ++k) {
result += A[k + i * matrixDim] * B[j + k * matrixDim];
}
if (fabs(C[j + i * matrixDim] - result) > 0.001 * matrixDim) {
printf("At [%u, %u]: Expected %f, Found %f\n", i, j, result,
C[j + i * matrixDim]);
++numErrors;
}
}
}
if (numErrors != 0) {
printf("%d value mismatches occured\n", numErrors);
fflush(stdout);
exit(EXIT_FAILURE); // exit since value mismatches occured
}
}
#endif
void copyMatrix(float *dstMatrix, float *srcMatrix, unsigned int matrixDim) {
size_t size = matrixDim * matrixDim * sizeof(float);
memcpy(dstMatrix, srcMatrix, size);
}
void verifyMatrixData(float *expectedData, float *observedData,
unsigned int matrixDim) {
unsigned int i, j, numErrors = 0;
for (i = 0; i < matrixDim; ++i) {
for (j = 0; j < matrixDim; ++j) {
if (expectedData[j + i * matrixDim] != observedData[j + i * matrixDim]) {
++numErrors;
if (verboseResults) {
printf("At [%u, %u]: Expected %f, Found %f\n", i, j,
expectedData[j + i * matrixDim],
observedData[j + i * matrixDim]);
}
}
}
}
if (numErrors != 0) {
printf("%d value mismatches occured\n", numErrors);
fflush(stdout);
exit(EXIT_FAILURE); // exit since value mismatches occured
}
}
#define BLOCK_SIZE 32
__global__ void matrixMultiplyKernel(float *C, float *A, float *B,
unsigned int matrixDim) {
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
unsigned int wA = matrixDim;
unsigned int wB = matrixDim;
// Index of the first sub-matrix of A processed by the block
int aBegin = matrixDim * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
void runMatrixMultiplyKernel(unsigned int matrixDim, int allocType,
unsigned int numLoops, double *gpuLaunchCallsTimes,
double *gpuTransferToCallsTimes,
double *gpuTransferFromCallsTimes,
double *gpuLaunchAndTransferCallsTimes,
double *gpuLaunchTransferSyncTimes,
double *cpuAccessTimes, double *overallTimes,
int device_id) {
float *dptrA = NULL, *hptrA = NULL;
float *dptrB = NULL, *hptrB = NULL;
float *dptrC = NULL, *hptrC = NULL;
float *randValuesX = NULL, *randValuesY = NULL;
float *randValuesVerifyXmulY = NULL, *randValuesVerifyYmulX = NULL;
bool copyRequired = false, hintsRequired = false;
bool someTransferOpRequired;
bool isAsync = false;
cudaStream_t streamToRunOn;
unsigned int *latch;
size_t size = matrixDim * matrixDim * sizeof(float);
dim3 threads(32, 32);
dim3 grid(matrixDim / threads.x, matrixDim / threads.y);
StopWatchInterface *gpuLaunchCallsTimer = 0, *gpuTransferCallsTimer = 0;
StopWatchInterface *gpuSyncTimer = 0, *cpuAccessTimer = 0;
sdkCreateTimer(&gpuLaunchCallsTimer);
sdkCreateTimer(&gpuTransferCallsTimer);
sdkCreateTimer(&gpuSyncTimer);
sdkCreateTimer(&cpuAccessTimer);
unsigned int i;
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device_id));
checkCudaErrors(cudaStreamCreate(&streamToRunOn));
randValuesX = (float *)malloc(size);
if (!randValuesX) {
exit(EXIT_FAILURE); // exit since memory allocation error
}
randValuesY = (float *)malloc(size);
if (!randValuesY) {
exit(EXIT_FAILURE); // exit since memory allocation error
}
randValuesVerifyXmulY = (float *)malloc(size);
if (!randValuesVerifyXmulY) {
exit(EXIT_FAILURE); // exit since memory allocation error
}
randValuesVerifyYmulX = (float *)malloc(size);
if (!randValuesVerifyYmulX) {
exit(EXIT_FAILURE); // exit since memory allocation error
}
checkCudaErrors(cudaMalloc(&dptrA, size));
checkCudaErrors(cudaMalloc(&dptrB, size));
checkCudaErrors(cudaMalloc(&dptrC, size));
fillMatrixWithRandomValues(randValuesX, matrixDim);
fillMatrixWithRandomValues(randValuesY, matrixDim);
checkCudaErrors(
cudaMemcpyAsync(dptrA, randValuesX, size, cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpyAsync(dptrB, randValuesY, size, cudaMemcpyHostToDevice));
matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrA, dptrB, matrixDim);
checkCudaErrors(cudaMemcpyAsync(randValuesVerifyXmulY, dptrC, size,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaStreamSynchronize(NULL));
matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrB, dptrA, matrixDim);
checkCudaErrors(cudaMemcpyAsync(randValuesVerifyYmulX, dptrC, size,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaStreamSynchronize(NULL));
#if VERIFY_GPU_CORRECTNESS
verifyMatrixMultiplyCorrectness(randValuesVerifyXmulY, randValuesX,
randValuesY, matrixDim);
verifyMatrixMultiplyCorrectness(randValuesVerifyYmulX, randValuesY,
randValuesX, matrixDim);
#endif
checkCudaErrors(cudaFree(dptrA));
checkCudaErrors(cudaFree(dptrB));
checkCudaErrors(cudaFree(dptrC));
checkCudaErrors(cudaMallocHost(&latch, sizeof(unsigned int)));
switch (allocType) {
case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY:
case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC:
hptrA = (float *)malloc(size);
if (!hptrA) {
exit(EXIT_FAILURE); // exit since memory allocation error
}
hptrB = (float *)malloc(size);
if (!hptrB) {
exit(EXIT_FAILURE); // exit since memory allocation error
}
hptrC = (float *)malloc(size);
if (!hptrC) {
exit(EXIT_FAILURE); // exit since memory allocation error
}
checkCudaErrors(cudaMalloc(&dptrA, size));
checkCudaErrors(cudaMalloc(&dptrB, size));
checkCudaErrors(cudaMalloc(&dptrC, size));
copyRequired = true;
break;
case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY:
case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC:
checkCudaErrors(cudaMallocHost(&hptrA, size));
checkCudaErrors(cudaMallocHost(&hptrB, size));
checkCudaErrors(cudaMallocHost(&hptrC, size));
checkCudaErrors(cudaMalloc(&dptrA, size));
checkCudaErrors(cudaMalloc(&dptrB, size));
checkCudaErrors(cudaMalloc(&dptrC, size));
copyRequired = true;
break;
case USE_ZERO_COPY:
checkCudaErrors(cudaMallocHost(&hptrA, size));
checkCudaErrors(cudaMallocHost(&hptrB, size));
checkCudaErrors(cudaMallocHost(&hptrC, size));
checkCudaErrors(cudaHostGetDevicePointer(&dptrA, hptrA, 0));
checkCudaErrors(cudaHostGetDevicePointer(&dptrB, hptrB, 0));
checkCudaErrors(cudaHostGetDevicePointer(&dptrC, hptrC, 0));
break;
case USE_MANAGED_MEMORY:
checkCudaErrors(cudaMallocManaged(&dptrA, size));
checkCudaErrors(cudaMallocManaged(&dptrB, size));
checkCudaErrors(cudaMallocManaged(&dptrC, size));
hptrA = dptrA;
hptrB = dptrB;
hptrC = dptrC;
break;
case USE_MANAGED_MEMORY_WITH_HINTS:
case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC:
if (deviceProp.concurrentManagedAccess) {
checkCudaErrors(cudaMallocManaged(&dptrA, size));
checkCudaErrors(cudaMallocManaged(&dptrB, size));
checkCudaErrors(cudaMallocManaged(&dptrC, size));
checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId));
checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId));
checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId));
} else {
checkCudaErrors(cudaMallocManaged(&dptrA, size, cudaMemAttachHost));
checkCudaErrors(cudaMallocManaged(&dptrB, size, cudaMemAttachHost));
checkCudaErrors(cudaMallocManaged(&dptrC, size, cudaMemAttachHost));
}
hptrA = dptrA;
hptrB = dptrB;
hptrC = dptrC;
hintsRequired = true;
break;
default:
exit(EXIT_FAILURE); // exit with error
}
if (allocType == USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC ||
allocType == USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC ||
allocType == USE_MANAGED_MEMORY_WITH_HINTS_ASYNC) {
isAsync = true;
}
someTransferOpRequired = copyRequired || hintsRequired;
// fill buffers with 0 to avoid any first access page-fault overheads.
memset(hptrA, 0, size);
memset(hptrB, 0, size);
memset(hptrC, 0, size);
for (i = 0; i < numLoops; i++) {
cpuAccessTimes[i] = 0.0;
gpuLaunchCallsTimes[i] = 0.0;
gpuTransferToCallsTimes[i] = 0.0;
gpuTransferFromCallsTimes[i] = 0.0;
sdkStartTimer(&cpuAccessTimer);
{
copyMatrix(hptrA, ((i & 0x1) == 0) ? randValuesX : randValuesY, matrixDim);
copyMatrix(hptrB, ((i & 0x1) == 0) ? randValuesY : randValuesX, matrixDim);
}
sdkStopTimer(&cpuAccessTimer);
cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
sdkResetTimer(&cpuAccessTimer);
if (isAsync && hintsRequired) {
*latch = 0;
// Prevent any work on stream from starting until all work is pushed
spinWhileLessThanOne<<<1, 1, 0, streamToRunOn>>>(latch);
}
if (someTransferOpRequired) {
sdkStartTimer(&gpuTransferCallsTimer);
if (copyRequired) {
if (isAsync) {
checkCudaErrors(cudaMemcpyAsync(
dptrA, hptrA, size, cudaMemcpyHostToDevice, streamToRunOn));
checkCudaErrors(cudaMemcpyAsync(
dptrB, hptrB, size, cudaMemcpyHostToDevice, streamToRunOn));
} else {
checkCudaErrors(
cudaMemcpy(dptrA, hptrA, size, cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpy(dptrB, hptrB, size, cudaMemcpyHostToDevice));
}
}
if (hintsRequired) {
if (deviceProp.concurrentManagedAccess) {
checkCudaErrors(
cudaMemPrefetchAsync(dptrA, size, device_id, streamToRunOn));
checkCudaErrors(
cudaMemPrefetchAsync(dptrB, size, device_id, streamToRunOn));
checkCudaErrors(
cudaMemPrefetchAsync(dptrC, size, device_id, streamToRunOn));
} else {
checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0,
cudaMemAttachGlobal));
checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0,
cudaMemAttachGlobal));
checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0,
cudaMemAttachGlobal));
}
if (!isAsync) {
checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
}
}
sdkStopTimer(&gpuTransferCallsTimer);
gpuTransferToCallsTimes[i] +=
sdkGetAverageTimerValue(&gpuTransferCallsTimer);
sdkResetTimer(&gpuTransferCallsTimer);
}
sdkStartTimer(&gpuLaunchCallsTimer);
{
matrixMultiplyKernel<<<grid, threads, 0, streamToRunOn>>>(
dptrC, dptrA, dptrB, matrixDim);
if (!isAsync) {
checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
}
}
sdkStopTimer(&gpuLaunchCallsTimer);
gpuLaunchCallsTimes[i] += sdkGetAverageTimerValue(&gpuLaunchCallsTimer);
sdkResetTimer(&gpuLaunchCallsTimer);
if (someTransferOpRequired) {
sdkStartTimer(&gpuTransferCallsTimer);
if (hintsRequired) {
if (deviceProp.concurrentManagedAccess) {
checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId));
checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId));
checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId));
} else {
checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0,
cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0,
cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0,
cudaMemAttachHost));
}
if (!isAsync) {
checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
}
}
if (copyRequired) {
if (isAsync) {
checkCudaErrors(cudaMemcpyAsync(
hptrC, dptrC, size, cudaMemcpyDeviceToHost, streamToRunOn));
} else {
checkCudaErrors(
cudaMemcpy(hptrC, dptrC, size, cudaMemcpyDeviceToHost));
}
}
sdkStopTimer(&gpuTransferCallsTimer);
gpuTransferFromCallsTimes[i] +=
sdkGetAverageTimerValue(&gpuTransferCallsTimer);
sdkResetTimer(&gpuTransferCallsTimer);
}
gpuLaunchAndTransferCallsTimes[i] = gpuLaunchCallsTimes[i] +
gpuTransferToCallsTimes[i] +
gpuTransferFromCallsTimes[i];
gpuLaunchTransferSyncTimes[i] = gpuLaunchAndTransferCallsTimes[i];
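// For the async variants the preceding calls only queued work on the
// stream; release the latch (used by the hinted async path) and
// synchronize so this timer captures the actual transfers and kernel
// execution.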
if (isAsync) {
sdkStartTimer(&gpuSyncTimer);
{
if (hintsRequired) {
*latch = 1;
}
checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
}
sdkStopTimer(&gpuSyncTimer);
gpuLaunchTransferSyncTimes[i] += sdkGetAverageTimerValue(&gpuSyncTimer);
sdkResetTimer(&gpuSyncTimer);
}
sdkStartTimer(&cpuAccessTimer);
{
verifyMatrixData(
((i & 0x1) == 0) ? randValuesVerifyXmulY : randValuesVerifyYmulX, hptrC,
matrixDim);
}
sdkStopTimer(&cpuAccessTimer);
cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
sdkResetTimer(&cpuAccessTimer);
overallTimes[i] = cpuAccessTimes[i] + gpuLaunchTransferSyncTimes[i];
}
switch (allocType) {
case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY:
case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC:
free(hptrA);
free(hptrB);
free(hptrC);
checkCudaErrors(cudaFree(dptrA));
checkCudaErrors(cudaFree(dptrB));
checkCudaErrors(cudaFree(dptrC));
break;
case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY:
case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC:
checkCudaErrors(cudaFreeHost(hptrA));
checkCudaErrors(cudaFreeHost(hptrB));
checkCudaErrors(cudaFreeHost(hptrC));
checkCudaErrors(cudaFree(dptrA));
checkCudaErrors(cudaFree(dptrB));
checkCudaErrors(cudaFree(dptrC));
break;
case USE_ZERO_COPY:
checkCudaErrors(cudaFreeHost(hptrA));
checkCudaErrors(cudaFreeHost(hptrB));
checkCudaErrors(cudaFreeHost(hptrC));
break;
case USE_MANAGED_MEMORY:
case USE_MANAGED_MEMORY_WITH_HINTS:
case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC:
checkCudaErrors(cudaFree(dptrA));
checkCudaErrors(cudaFree(dptrB));
checkCudaErrors(cudaFree(dptrC));
break;
default:
exit(EXIT_FAILURE); // exit due to error
}
checkCudaErrors(cudaStreamDestroy(streamToRunOn));
checkCudaErrors(cudaFreeHost(latch));
free(randValuesX);
free(randValuesY);
free(randValuesVerifyXmulY);
free(randValuesVerifyYmulX);
sdkDeleteTimer(&gpuLaunchCallsTimer);
sdkDeleteTimer(&gpuTransferCallsTimer);
sdkDeleteTimer(&gpuSyncTimer);
sdkDeleteTimer(&cpuAccessTimer);
}
void matrixMultiplyPerfRunner(bool reportAsBandwidth,
bool print_launch_transfer_results,
bool print_std_deviation, int device_id) {
int i;
unsigned int minMatrixDim = 32;
unsigned int multiplierDim = 2;
unsigned int matrixDim;
unsigned int minSize = minMatrixDim * minMatrixDim * sizeof(float);
unsigned int maxSize =
(maxSampleSizeInMb * ONE_MB) /
4; // 3 buffers are used, but divide by 4 (the nearest power of 2)
unsigned int multiplier = multiplierDim * multiplierDim;
unsigned int numSizesToTest;
struct testResults *results;
struct resultsData *gpuLaunchCallsTimes;
struct resultsData *gpuTransferToCallsTimes;
struct resultsData *gpuTransferFromCallsTimes;
struct resultsData *gpuLaunchAndTransferCallsTimes;
struct resultsData *gpuLaunchTransferSyncTimes;
struct resultsData *cpuAccessTimes;
struct resultsData *overallTimes;
unsigned long *sizesToTest;
unsigned int j;
numSizesToTest = findNumSizesToTest(minSize, maxSize, multiplier);
createAndInitTestResults(&results, "matrixMultiplyPerf", numKernelRuns,
numSizesToTest);
sizesToTest = getPtrSizesToTest(results);
createResultDataAndAddToTestResults(&gpuLaunchCallsTimes, results,
"GPU Kernel Launch Call Time", false,
reportAsBandwidth);
createResultDataAndAddToTestResults(&gpuTransferToCallsTimes, results,
"CPU to GPU Transfer Calls Time", false,
reportAsBandwidth);
createResultDataAndAddToTestResults(&gpuTransferFromCallsTimes, results,
"GPU to CPU Transfer Calls Time", false,
reportAsBandwidth);
createResultDataAndAddToTestResults(&gpuLaunchAndTransferCallsTimes, results,
"GPU Launch and Transfer Calls Time",
false, reportAsBandwidth);
createResultDataAndAddToTestResults(&gpuLaunchTransferSyncTimes, results,
"GPU Launch Transfer and Sync Time",
false, reportAsBandwidth);
createResultDataAndAddToTestResults(
&cpuAccessTimes, results, "CPU Access Time", false, reportAsBandwidth);
createResultDataAndAddToTestResults(&overallTimes, results, "Overall Time",
false, reportAsBandwidth);
printf("Running ");
for (matrixDim = minMatrixDim, j = 0;
matrixDim * matrixDim <= maxSize / sizeof(float);
matrixDim *= multiplierDim, ++j) {
sizesToTest[j] = matrixDim * matrixDim * sizeof(float);
for (i = MEMALLOC_TYPE_START; i <= MEMALLOC_TYPE_END; i++) {
printf(".");
fflush(stdout);
runMatrixMultiplyKernel(
matrixDim, i, numKernelRuns,
getPtrRunTimesInMs(gpuLaunchCallsTimes, i, j),
getPtrRunTimesInMs(gpuTransferToCallsTimes, i, j),
getPtrRunTimesInMs(gpuTransferFromCallsTimes, i, j),
getPtrRunTimesInMs(gpuLaunchAndTransferCallsTimes, i, j),
getPtrRunTimesInMs(gpuLaunchTransferSyncTimes, i, j),
getPtrRunTimesInMs(cpuAccessTimes, i, j),
getPtrRunTimesInMs(overallTimes, i, j), device_id);
}
}
printf("\n");
printResults(results, print_launch_transfer_results, print_std_deviation);
freeTestResultsAndAllResultsData(results);
}
static void usage() {
printf(
"./cudaMemoryTypesPerf [-device=<device_id>] [-reportAsBandwidth] "
"[-print-launch-transfer-results] [-print-std-deviation] [-verbose]\n");
printf("Options:\n");
printf(
"-reportAsBandwidth: By default time taken is printed, this "
"option allows to instead print bandwidth.\n");
printf(
"-print-launch-transfer-results: By default overall results are printed, "
"this option allows to print data transfers and kernel time as well.\n");
printf(
"-print-std-deviation: Prints std deviation of the results.\n");
printf(
"-kernel-iterations=<num>: Number of times the kernel tests should "
"be run[default is 100 iterations].\n");
printf(
"-device=<device_id>: Allows to pass GPU Device ID on which "
"the tests will be run.\n");
printf("-verbose: Prints highly verbose output.\n");
}
int main(int argc, char **argv) {
bool reportAsBandwidth = false;
bool print_launch_transfer_results = false;
bool print_std_deviation = false;
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
checkCmdLineFlag(argc, (const char **)argv, "h")) {
usage();
printf("&&&& %s WAIVED\n", argv[0]);
exit(EXIT_WAIVED);
}
if (checkCmdLineFlag(argc, (const char **)argv, "reportAsBandwidth")) {
reportAsBandwidth = true;
}
if (checkCmdLineFlag(argc, (const char **)argv,
"print-launch-transfer-results")) {
print_launch_transfer_results = true;
}
if (checkCmdLineFlag(argc, (const char **)argv, "print-std-deviation")) {
print_std_deviation = true;
}
if (checkCmdLineFlag(argc, (const char **)argv, "kernel-iterations")) {
numKernelRuns =
getCmdLineArgumentInt(argc, (const char **)argv, "kernel-iterations");
}
if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) {
verboseResults = 1;
}
int device_id = findCudaDevice(argc, (const char **)argv);
matrixMultiplyPerfRunner(reportAsBandwidth, print_launch_transfer_results,
print_std_deviation, device_id);
printf(
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
exit(EXIT_SUCCESS);
}
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "commonKernels.hpp"
__global__ void spinWhileLessThanOne(volatile unsigned int *latch) {
while (latch[0] < 1)
;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cstdio>
#include <vector>
#include <helper_cuda.h>
#include <helper_timer.h>
using namespace std;
const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test";
typedef enum
{
P2P_WRITE = 0,
P2P_READ = 1,
}P2PDataTransfer;
typedef enum
{
CE = 0,
SM = 1,
}P2PEngine;
P2PEngine p2p_mechanism = CE; // By default use Copy Engine
//Macro for checking cuda errors following a cuda launch or api call
#define cudaCheckError() { \
cudaError_t e=cudaGetLastError(); \
if(e!=cudaSuccess) { \
printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
}
__global__ void delay(volatile int *flag, unsigned long long timeout_clocks = 10000000)
{
// Wait until the application notifies us that it has completed queuing up the
// experiment, or timeout and exit, allowing the application to make progress
long long int start_clock, sample_clock;
start_clock = clock64();
while (!*flag) {
sample_clock = clock64();
if (sample_clock - start_clock > timeout_clocks) {
break;
}
}
}
// This kernel is for demonstration purposes only, not a performant kernel for p2p transfers.
__global__ void copyp2p(int4* __restrict__ dest, int4 const* __restrict__ src, size_t num_elems)
{
size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
size_t gridSize = blockDim.x * gridDim.x;
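// Grid-stride loop over int4 elements: each iteration moves 16 bytes per
// thread, and the caller passes num_elems as the int count divided by 4.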
#pragma unroll(5)
for (size_t i=globalId; i < num_elems; i+= gridSize)
{
dest[i] = src[i];
}
}
///////////////////////////////////////////////////////////////////////////
//Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void)
{
printf("Usage: p2pBandwidthLatencyTest [OPTION]...\n");
printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
printf("\n");
printf("Options:\n");
printf("--help\t\tDisplay this help menu\n");
printf("--p2p_read\tUse P2P reads for data transfers between GPU pairs and show corresponding results.\n \t\tDefault used is P2P write operation.\n");
printf("--sm_copy\t\tUse SM intiated p2p transfers instead of Copy Engine\n");
}
void checkP2Paccess(int numGPUs)
{
for (int i = 0; i < numGPUs; i++) {
cudaSetDevice(i);
cudaCheckError();
for (int j = 0; j < numGPUs; j++) {
int access;
if (i != j) {
cudaDeviceCanAccessPeer(&access, i, j);
cudaCheckError();
printf("Device=%d %s Access Peer Device=%d\n", i, access ? "CAN" : "CANNOT", j);
}
}
}
printf("\n***NOTE: In case a device doesn't have P2P access to other one, it falls back to normal memcopy procedure.\nSo you can see lesser Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n");
}
void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice, int num_elems, int repeat, bool p2paccess, cudaStream_t streamToRun)
{
int blockSize = 0;
int numBlocks = 0;
cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);
cudaCheckError();
if (p2p_mechanism == SM && p2paccess)
{
for (int r = 0; r < repeat; r++) {
copyp2p<<<numBlocks, blockSize, 0, streamToRun>>>((int4*)dest, (int4*)src, num_elems/4);
}
}
else
{
for (int r = 0; r < repeat; r++) {
cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice, sizeof(int)*num_elems, streamToRun);
}
}
}
void outputBandwidthMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method)
{
int numElems = 10000000;
int repeat = 5;
volatile int *flag = NULL;
vector<int *> buffers(numGPUs);
vector<int *> buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy
vector<cudaEvent_t> start(numGPUs);
vector<cudaEvent_t> stop(numGPUs);
vector<cudaStream_t> stream(numGPUs);
cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
cudaCheckError();
for (int d = 0; d < numGPUs; d++) {
cudaSetDevice(d);
cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
cudaMalloc(&buffers[d], numElems * sizeof(int));
cudaCheckError();
cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
cudaCheckError();
cudaEventCreate(&start[d]);
cudaCheckError();
cudaEventCreate(&stop[d]);
cudaCheckError();
}
vector<double> bandwidthMatrix(numGPUs * numGPUs);
for (int i = 0; i < numGPUs; i++) {
cudaSetDevice(i);
for (int j = 0; j < numGPUs; j++) {
int access = 0;
if (p2p) {
cudaDeviceCanAccessPeer(&access, i, j);
if (access) {
cudaDeviceEnablePeerAccess(j, 0 );
cudaCheckError();
cudaSetDevice(j);
cudaCheckError();
cudaDeviceEnablePeerAccess(i, 0 );
cudaCheckError();
cudaSetDevice(i);
cudaCheckError();
}
}
cudaStreamSynchronize(stream[i]);
cudaCheckError();
// Block the stream until all the work is queued up
// DANGER! - cudaMemcpy*Async may infinitely block waiting for
// room to push the operation, so keep the number of repetitions
// relatively low. Higher repetition counts will cause the delay kernel
// to time out and lead to unstable results.
*flag = 0;
delay<<< 1, 1, 0, stream[i]>>>(flag);
cudaCheckError();
cudaEventRecord(start[i], stream[i]);
cudaCheckError();
if (i == j) {
// Perform intra-GPU, D2D copies
performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream[i]);
}
else {
if (p2p_method == P2P_WRITE)
{
performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream[i]);
}
else
{
performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream[i]);
}
}
cudaEventRecord(stop[i], stream[i]);
cudaCheckError();
// Release the queued events
*flag = 1;
cudaStreamSynchronize(stream[i]);
cudaCheckError();
float time_ms;
cudaEventElapsedTime(&time_ms, start[i], stop[i]);
double time_s = time_ms / 1e3;
double gb = numElems * sizeof(int) * repeat / (double)1e9;
if (i == j) {
gb *= 2; //must count both the read and the write here
}
bandwidthMatrix[i * numGPUs + j] = gb / time_s;
if (p2p && access) {
cudaDeviceDisablePeerAccess(j);
cudaSetDevice(j);
cudaDeviceDisablePeerAccess(i);
cudaSetDevice(i);
cudaCheckError();
}
}
}
printf(" D\\D");
for (int j = 0; j < numGPUs; j++) {
printf("%6d ", j);
}
printf("\n");
for (int i = 0; i < numGPUs; i++) {
printf("%6d ", i);
for (int j = 0; j < numGPUs; j++) {
printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
}
printf("\n");
}
for (int d = 0; d < numGPUs; d++) {
cudaSetDevice(d);
cudaFree(buffers[d]);
cudaFree(buffersD2D[d]);
cudaCheckError();
cudaEventDestroy(start[d]);
cudaCheckError();
cudaEventDestroy(stop[d]);
cudaCheckError();
cudaStreamDestroy(stream[d]);
cudaCheckError();
}
cudaFreeHost((void *)flag);
cudaCheckError();
}
void outputBidirectionalBandwidthMatrix(int numGPUs, bool p2p)
{
int numElems = 10000000;
int repeat = 5;
volatile int *flag = NULL;
vector<int *> buffers(numGPUs);
vector<int *> buffersD2D(numGPUs);
vector<cudaEvent_t> start(numGPUs);
vector<cudaEvent_t> stop(numGPUs);
vector<cudaStream_t> stream0(numGPUs);
vector<cudaStream_t> stream1(numGPUs);
cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
cudaCheckError();
for (int d = 0; d < numGPUs; d++) {
cudaSetDevice(d);
cudaMalloc(&buffers[d], numElems * sizeof(int));
cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
cudaCheckError();
cudaEventCreate(&start[d]);
cudaCheckError();
cudaEventCreate(&stop[d]);
cudaCheckError();
cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking);
cudaCheckError();
cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking);
cudaCheckError();
}
vector<double> bandwidthMatrix(numGPUs * numGPUs);
for (int i = 0; i < numGPUs; i++) {
cudaSetDevice(i);
for (int j = 0; j < numGPUs; j++) {
int access = 0;
if (p2p) {
cudaDeviceCanAccessPeer(&access, i, j);
if (access) {
cudaSetDevice(i);
cudaDeviceEnablePeerAccess(j, 0);
cudaCheckError();
cudaSetDevice(j);
cudaDeviceEnablePeerAccess(i, 0);
cudaCheckError();
}
}
cudaSetDevice(i);
cudaStreamSynchronize(stream0[i]);
cudaStreamSynchronize(stream1[j]);
cudaCheckError();
// Block the stream until all the work is queued up
// DANGER! - cudaMemcpy*Async may infinitely block waiting for
// room to push the operation, so keep the number of repetitions
// relatively low. Higher repetition counts will cause the delay kernel
// to time out and lead to unstable results.
*flag = 0;
cudaSetDevice(i);
// No need to block stream1 since it'll be blocked on stream0's event
delay<<< 1, 1, 0, stream0[i]>>>(flag);
cudaCheckError();
// Force stream1 not to start until stream0 does, in order to ensure
// the events on stream0 fully encompass the time needed for all operations
cudaEventRecord(start[i], stream0[i]);
cudaStreamWaitEvent(stream1[j], start[i], 0);
if (i == j) {
// For intra-GPU perform 2 memcopies buffersD2D <-> buffers
performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat, access, stream0[i]);
performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat, access, stream1[i]);
}
else {
if (access && p2p_mechanism == SM)
{
cudaSetDevice(j);
}
performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access, stream1[j]);
if (access && p2p_mechanism == SM)
{
cudaSetDevice(i);
}
performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access, stream0[i]);
}
// Notify stream0 that stream1 is complete and record the time of
// the total transaction
cudaEventRecord(stop[j], stream1[j]);
cudaStreamWaitEvent(stream0[i], stop[j], 0);
cudaEventRecord(stop[i], stream0[i]);
// Release the queued operations
*flag = 1;
cudaStreamSynchronize(stream0[i]);
cudaStreamSynchronize(stream1[j]);
cudaCheckError();
float time_ms;
cudaEventElapsedTime(&time_ms, start[i], stop[i]);
double time_s = time_ms / 1e3;
double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9;
if (i == j) {
gb *= 2; //must count both the read and the write here
}
bandwidthMatrix[i * numGPUs + j] = gb / time_s;
if (p2p && access) {
cudaSetDevice(i);
cudaDeviceDisablePeerAccess(j);
cudaSetDevice(j);
cudaDeviceDisablePeerAccess(i);
}
}
}
printf(" D\\D");
for (int j = 0; j < numGPUs; j++) {
printf("%6d ", j);
}
printf("\n");
for (int i = 0; i < numGPUs; i++) {
printf("%6d ", i);
for (int j = 0; j < numGPUs; j++) {
printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
}
printf("\n");
}
for (int d = 0; d < numGPUs; d++) {
cudaSetDevice(d);
cudaFree(buffers[d]);
cudaFree(buffersD2D[d]);
cudaCheckError();
cudaEventDestroy(start[d]);
cudaCheckError();
cudaEventDestroy(stop[d]);
cudaCheckError();
cudaStreamDestroy(stream0[d]);
cudaCheckError();
cudaStreamDestroy(stream1[d]);
cudaCheckError();
}
cudaFreeHost((void *)flag);
cudaCheckError();
}
void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method)
{
int repeat = 100;
volatile int *flag = NULL;
StopWatchInterface *stopWatch = NULL;
vector<int *> buffers(numGPUs);
vector<int *> buffersD2D(numGPUs); // buffer for D2D, that is, intra-GPU copy
vector<cudaStream_t> stream(numGPUs);
vector<cudaEvent_t> start(numGPUs);
vector<cudaEvent_t> stop(numGPUs);
cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
cudaCheckError();
if (!sdkCreateTimer(&stopWatch)) {
printf("Failed to create stop watch\n");
exit(EXIT_FAILURE);
}
sdkStartTimer(&stopWatch);
for (int d = 0; d < numGPUs; d++) {
cudaSetDevice(d);
cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
cudaMalloc(&buffers[d], sizeof(int));
cudaMalloc(&buffersD2D[d], sizeof(int));
cudaCheckError();
cudaEventCreate(&start[d]);
cudaCheckError();
cudaEventCreate(&stop[d]);
cudaCheckError();
}
vector<double> gpuLatencyMatrix(numGPUs * numGPUs);
vector<double> cpuLatencyMatrix(numGPUs * numGPUs);
for (int i = 0; i < numGPUs; i++) {
cudaSetDevice(i);
for (int j = 0; j < numGPUs; j++) {
int access = 0;
if (p2p) {
cudaDeviceCanAccessPeer(&access, i, j);
if (access) {
cudaDeviceEnablePeerAccess(j, 0);
cudaCheckError();
cudaSetDevice(j);
cudaDeviceEnablePeerAccess(i, 0);
cudaSetDevice(i);
cudaCheckError();
}
}
cudaStreamSynchronize(stream[i]);
cudaCheckError();
// Block the stream until all the work is queued up
// DANGER! - cudaMemcpy*Async may infinitely block waiting for
// room to push the operation, so keep the number of repetitions
// relatively low. Higher repetition counts will cause the delay kernel
// to time out and lead to unstable results.
*flag = 0;
delay<<< 1, 1, 0, stream[i]>>>(flag);
cudaCheckError();
cudaEventRecord(start[i], stream[i]);
sdkResetTimer(&stopWatch);
if (i == j) {
// Perform intra-GPU, D2D copies
performP2PCopy(buffers[i], i, buffersD2D[i], i, 1, repeat, access, stream[i]);
}
else {
if (p2p_method == P2P_WRITE)
{
performP2PCopy(buffers[j], j, buffers[i], i, 1, repeat, access, stream[i]);
}
else
{
performP2PCopy(buffers[i], i, buffers[j], j, 1, repeat, access, stream[i]);
}
}
float cpu_time_ms = sdkGetTimerValue(&stopWatch);
cudaEventRecord(stop[i], stream[i]);
// Now that the work has been queued up, release the stream
*flag = 1;
cudaStreamSynchronize(stream[i]);
cudaCheckError();
float gpu_time_ms;
cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]);
gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat;
cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat;
if (p2p && access) {
cudaDeviceDisablePeerAccess(j);
cudaSetDevice(j);
cudaDeviceDisablePeerAccess(i);
cudaSetDevice(i);
cudaCheckError();
}
}
}
printf(" GPU");
for (int j = 0; j < numGPUs; j++) {
printf("%6d ", j);
}
printf("\n");
for (int i = 0; i < numGPUs; i++) {
printf("%6d ", i);
for (int j = 0; j < numGPUs; j++) {
printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]);
}
printf("\n");
}
printf("\n CPU");
for (int j = 0; j < numGPUs; j++) {
printf("%6d ", j);
}
printf("\n");
for (int i = 0; i < numGPUs; i++) {
printf("%6d ", i);
for (int j = 0; j < numGPUs; j++) {
printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]);
}
printf("\n");
}
for (int d = 0; d < numGPUs; d++) {
cudaSetDevice(d);
cudaFree(buffers[d]);
cudaFree(buffersD2D[d]);
cudaCheckError();
cudaEventDestroy(start[d]);
cudaCheckError();
cudaEventDestroy(stop[d]);
cudaCheckError();
cudaStreamDestroy(stream[d]);
cudaCheckError();
}
sdkDeleteTimer(&stopWatch);
cudaFreeHost((void *)flag);
cudaCheckError();
}
int main(int argc, char **argv)
{
int numGPUs;
P2PDataTransfer p2p_method = P2P_WRITE;
cudaGetDeviceCount(&numGPUs);
cudaCheckError();
//process command line args
if (checkCmdLineFlag(argc, (const char**)argv, "help"))
{
printHelp();
return 0;
}
if (checkCmdLineFlag(argc, (const char**)argv, "p2p_read"))
{
p2p_method = P2P_READ;
}
if (checkCmdLineFlag(argc, (const char**)argv, "sm_copy"))
{
p2p_mechanism = SM;
}
printf("[%s]\n", sSampleName);
//output devices
for (int i = 0; i < numGPUs; i++) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
cudaCheckError();
printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i, prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);
}
checkP2Paccess(numGPUs);
//Check peer-to-peer connectivity
printf("P2P Connectivity Matrix\n");
printf(" D\\D");
for (int j = 0; j < numGPUs; j++) {
printf("%6d", j);
}
printf("\n");
for (int i = 0; i < numGPUs; i++) {
printf("%6d\t", i);
for (int j = 0; j < numGPUs; j++) {
if (i != j) {
int access;
cudaDeviceCanAccessPeer(&access, i, j);
cudaCheckError();
printf("%6d", (access) ? 1 : 0);
}
else {
printf("%6d", 1);
}
}
printf("\n");
}
printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
outputBandwidthMatrix(numGPUs, false, P2P_WRITE);
printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n");
outputBandwidthMatrix(numGPUs, true, P2P_WRITE);
if (p2p_method == P2P_READ)
{
printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n");
outputBandwidthMatrix(numGPUs, true, p2p_method);
}
printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
outputBidirectionalBandwidthMatrix(numGPUs, false);
printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n");
outputBidirectionalBandwidthMatrix(numGPUs, true);
printf("P2P=Disabled Latency Matrix (us)\n");
outputLatencyMatrix(numGPUs, false, P2P_WRITE);
printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n");
outputLatencyMatrix(numGPUs, true, P2P_WRITE);
if (p2p_method == P2P_READ)
{
printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n");
outputLatencyMatrix(numGPUs, true, p2p_method);
}
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
exit(EXIT_SUCCESS);
}
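// Typical invocations of this sample (binary name and flag spelling shown
// here are illustrative; printHelp() above is the authoritative reference):
//   p2pBandwidthLatencyTest              run with default P2P write copies
//   p2pBandwidthLatencyTest --p2p_read   also report P2P read bandwidth/latency
//   p2pBandwidthLatencyTest --sm_copy    copy with a kernel (SM) instead of the copy engine
//   p2pBandwidthLatencyTest --help       print usage information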
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "../inc/piestimator.h"
#include <string>
#include <vector>
#include <numeric>
#include <stdexcept>
#include <typeinfo>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <curand_kernel.h>
using std::string;
using std::vector;
// RNG init kernel
__global__ void initRNG(curandState *const rngStates,
const unsigned int seed)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Initialise the RNG
curand_init(seed, tid, 0, &rngStates[tid]);
}
__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta)
{
extern __shared__ unsigned int sdata[];
// Perform first level of reduction:
// - Write to shared memory
unsigned int ltid = threadIdx.x;
sdata[ltid] = in;
cg::sync(cta);
// Do reduction in shared mem
for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
if (ltid < s)
{
sdata[ltid] += sdata[ltid + s];
}
cg::sync(cta);
}
return sdata[0];
}
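// Worked example of the reduction above (illustrative): with blockDim.x = 8
// the loop runs with s = 4, 2, 1; each pass adds sdata[ltid + s] into
// sdata[ltid] for the lower half of the threads, so after three passes
// sdata[0] holds the sum of all eight per-thread counts.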
__device__ inline void getPoint(float &x, float &y, curandState &state)
{
x = curand_uniform(&state);
y = curand_uniform(&state);
}
__device__ inline void getPoint(double &x, double &y, curandState &state)
{
x = curand_uniform_double(&state);
y = curand_uniform_double(&state);
}
// Estimator kernel
template <typename Real>
__global__ void computeValue(unsigned int *const results,
curandState *const rngStates,
const unsigned int numSims)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Determine thread ID
unsigned int bid = blockIdx.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;
// Load this thread's RNG state (initialised earlier by initRNG)
curandState localState = rngStates[tid];
// Count the number of points which lie inside the unit quarter-circle
unsigned int pointsInside = 0;
for (unsigned int i = tid ; i < numSims ; i += step)
{
Real x;
Real y;
getPoint(x, y, localState);
Real l2norm2 = x * x + y * y;
if (l2norm2 < static_cast<Real>(1))
{
pointsInside++;
}
}
// Reduce within the block
pointsInside = reduce_sum(pointsInside, cta);
// Store the result
if (threadIdx.x == 0)
{
results[bid] = pointsInside;
}
}
template <typename Real>
PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed)
: m_numSims(numSims),
m_device(device),
m_threadBlockSize(threadBlockSize),
m_seed(seed)
{
}
template <typename Real>
Real PiEstimator<Real>::operator()()
{
cudaError_t cudaResult = cudaSuccess;
struct cudaDeviceProp deviceProperties;
struct cudaFuncAttributes funcAttributes;
// Get device properties
cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not get device properties: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Check precision is valid
if (typeid(Real) == typeid(double) &&
(deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3)))
{
throw std::runtime_error("Device does not have double precision support");
}
// Attach to GPU
cudaResult = cudaSetDevice(m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not set CUDA device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Determine how to divide the work between cores
dim3 block;
dim3 grid;
block.x = m_threadBlockSize;
grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
// Aim to launch around ten or more times as many blocks as there
// are multiprocessors on the target device.
unsigned int blocksPerSM = 10;
unsigned int numSMs = deviceProperties.multiProcessorCount;
while (grid.x > 2 * blocksPerSM * numSMs)
{
grid.x >>= 1;
}
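// Worked example of the heuristic above (illustrative figures): with
// numSims = 100000 and threadBlockSize = 128 the initial grid is
// ceil(100000 / 128) = 782 blocks; on a device with 20 multiprocessors the
// threshold is 2 * 10 * 20 = 400, so one halving leaves 391 blocks, i.e.
// roughly 20 blocks per multiprocessor.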
// Get initRNG function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for initRNG kernel");
}
// Get computeValue function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for computeValue kernel");
}
// Check the dimensions are valid
if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0])
{
throw std::runtime_error("Block X dimension is too large for device");
}
if (grid.x > (unsigned int)deviceProperties.maxGridSize[0])
{
throw std::runtime_error("Grid X dimension is too large for device");
}
// Allocate memory for RNG states
curandState *d_rngStates = 0;
cudaResult = cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for RNG states: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Allocate memory for result
// Each thread block will produce one result
unsigned int *d_results = 0;
cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for partial results: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Initialise RNG
initRNG<<<grid, block>>>(d_rngStates, m_seed);
// Count the points inside unit quarter-circle
computeValue<Real><<<grid, block, block.x *sizeof(unsigned int)>>>(d_results, d_rngStates, m_numSims);
// Copy partial results back
vector<unsigned int> results(grid.x);
cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy partial results to host: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Complete sum-reduction on host
Real value = static_cast<Real>(std::accumulate(results.begin(), results.end(), 0));
// Determine the proportion of points inside the quarter-circle,
// i.e. the area of the unit quarter-circle
value /= m_numSims;
// Value is currently an estimate of the area of a unit quarter-circle, so we can
// scale to a full circle by multiplying by four. Now since the area of a circle
// is pi * r^2, and r is one, the value will be an estimate for the value of pi.
value *= 4;
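// In symbols: the points are uniform on the unit square [0,1] x [0,1], so the
// probability of landing inside the quarter-circle x^2 + y^2 < 1 is
// (pi * 1^2 / 4) / 1^2 = pi / 4, and therefore
// pi ~= 4 * pointsInside / numSims.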
// Cleanup
if (d_rngStates)
{
cudaFree(d_rngStates);
d_rngStates = 0;
}
if (d_results)
{
cudaFree(d_results);
d_results = 0;
}
return value;
}
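// A minimal, hypothetical usage sketch for the estimator defined above (not
// part of the original sample; argument values are illustrative).
static float runPiEstimatorExample()
{
    // 100000 simulations on device 0, 128 threads per block, seed 1234
    PiEstimator<float> estimator(100000, 0, 128, 1234);
    return estimator();
}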
// Explicit template instantiation
template class PiEstimator<float>;
template class PiEstimator<double>;
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "../inc/piestimator.h"
#include <string>
#include <vector>
#include <numeric>
#include <stdexcept>
#include <typeinfo>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <curand.h>
using std::string;
using std::vector;
__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta)
{
extern __shared__ unsigned int sdata[];
// Perform first level of reduction:
// - Write to shared memory
unsigned int ltid = threadIdx.x;
sdata[ltid] = in;
cg::sync(cta);
// Do reduction in shared mem
for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
if (ltid < s)
{
sdata[ltid] += sdata[ltid + s];
}
cg::sync(cta);
}
return sdata[0];
}
// Estimator kernel
template <typename Real>
__global__ void computeValue(unsigned int *const results,
const Real *const points,
const unsigned int numSims)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Determine thread ID
unsigned int bid = blockIdx.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;
// Shift the input/output pointers
const Real *pointx = points + tid;
const Real *pointy = pointx + numSims;
// Count the number of points which lie inside the unit quarter-circle
unsigned int pointsInside = 0;
for (unsigned int i = tid ; i < numSims ; i += step, pointx += step, pointy += step)
{
Real x = *pointx;
Real y = *pointy;
Real l2norm2 = x * x + y * y;
if (l2norm2 < static_cast<Real>(1))
{
pointsInside++;
}
}
// Reduce within the block
pointsInside = reduce_sum(pointsInside, cta);
// Store the result
if (threadIdx.x == 0)
{
results[bid] = pointsInside;
}
}
template <typename Real>
PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize)
: m_numSims(numSims),
m_device(device),
m_threadBlockSize(threadBlockSize)
{
}
template <typename Real>
Real PiEstimator<Real>::operator()()
{
cudaError_t cudaResult = cudaSuccess;
struct cudaDeviceProp deviceProperties;
struct cudaFuncAttributes funcAttributes;
// Get device properties
cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not get device properties: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Check precision is valid
if (typeid(Real) == typeid(double) &&
(deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3)))
{
throw std::runtime_error("Device does not have double precision support");
}
// Attach to GPU
cudaResult = cudaSetDevice(m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not set CUDA device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Determine how to divide the work between cores
dim3 block;
dim3 grid;
block.x = m_threadBlockSize;
grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
// Aim to launch around ten or more times as many blocks as there
// are multiprocessors on the target device.
unsigned int blocksPerSM = 10;
unsigned int numSMs = deviceProperties.multiProcessorCount;
while (grid.x > 2 * blocksPerSM * numSMs)
{
grid.x >>= 1;
}
// Get computeValue function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for computeValue kernel");
}
// Check the dimensions are valid
if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0])
{
throw std::runtime_error("Block X dimension is too large for device");
}
if (grid.x > (unsigned int)deviceProperties.maxGridSize[0])
{
throw std::runtime_error("Grid X dimension is too large for device");
}
// Allocate memory for points
// Each simulation has two random numbers to give X and Y coordinate
Real *d_points = 0;
cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for random numbers: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Allocate memory for result
// Each thread block will produce one result
unsigned int *d_results = 0;
cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for partial results: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Generate random points in unit square
curandStatus_t curandResult;
curandGenerator_t qrng;
if (typeid(Real) == typeid(float))
{
curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);
}
else if (typeid(Real) == typeid(double))
{
curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL64);
}
else
{
string msg("Could not create random number generator of specified type");
throw std::runtime_error(msg);
}
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not create quasi-random number generator: ");
msg += std::to_string(curandResult);
throw std::runtime_error(msg);
}
curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not set number of dimensions for quasi-random number generator: ");
msg += std::to_string(curandResult);
throw std::runtime_error(msg);
}
curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not set order for quasi-random number generator: ");
msg += std::to_string(curandResult);
throw std::runtime_error(msg);
}
if (typeid(Real) == typeid(float))
{
curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);
}
else if (typeid(Real) == typeid(double))
{
curandResult = curandGenerateUniformDouble(qrng, (double *)d_points, 2 * m_numSims);
}
else
{
string msg("Could not generate random numbers of specified type");
throw std::runtime_error(msg);
}
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not generate quasi-random numbers: ");
msg += std::to_string(curandResult);
throw std::runtime_error(msg);
}
curandResult = curandDestroyGenerator(qrng);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not destroy quasi-random number generator: ");
msg += std::to_string(curandResult);
throw std::runtime_error(msg);
}
// Count the points inside unit quarter-circle
computeValue<Real><<<grid, block, block.x *sizeof(unsigned int)>>>(d_results, d_points, m_numSims);
// Copy partial results back
vector<unsigned int> results(grid.x);
cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy partial results to host: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Complete sum-reduction on host
Real value = static_cast<Real>(std::accumulate(results.begin(), results.end(), 0));
// Determine the proportion of points inside the quarter-circle,
// i.e. the area of the unit quarter-circle
value /= m_numSims;
// Value is currently an estimate of the area of a unit quarter-circle, so we can
// scale to a full circle by multiplying by four. Now since the area of a circle
// is pi * r^2, and r is one, the value will be an estimate for the value of pi.
value *= 4;
// Cleanup
if (d_points)
{
cudaFree(d_points);
d_points = 0;
}
if (d_results)
{
cudaFree(d_results);
d_results = 0;
}
return value;
}
// Explicit template instantiation
template class PiEstimator<float>;
template class PiEstimator<double>;
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Example showing the use of CUFFT for fast 1D-convolution using FFT. */
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_cuda.h>
#include <helper_functions.h>
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(Complex *, const Complex *,
int, float);
// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);
// Padding functions
int PadData(const Complex *, Complex **, int, const Complex *, Complex **, int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 50
#define FILTER_KERNEL_SIZE 11
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
printf("[simpleCUFFT] is starting...\n");
findCudaDevice(argc, (const char **)argv);
// Allocate host memory for the signal
Complex *h_signal =
reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));
// Initialize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
h_signal[i].y = 0;
}
// Allocate host memory for the filter
Complex *h_filter_kernel =
reinterpret_cast<Complex *>(malloc(sizeof(Complex) * FILTER_KERNEL_SIZE));
// Initialize the memory for the filter
for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i) {
h_filter_kernel[i].x = rand() / static_cast<float>(RAND_MAX);
h_filter_kernel[i].y = 0;
}
// Pad signal and filter kernel
Complex *h_padded_signal;
Complex *h_padded_filter_kernel;
int new_size =
PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_filter_kernel,
&h_padded_filter_kernel, FILTER_KERNEL_SIZE);
int mem_size = sizeof(Complex) * new_size;
// Allocate device memory for signal
Complex *d_signal;
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_signal), mem_size));
// Copy host memory to device
checkCudaErrors(
cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice));
// Allocate device memory for filter kernel
Complex *d_filter_kernel;
checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_filter_kernel), mem_size));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size,
cudaMemcpyHostToDevice));
// CUFFT plan simple API
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, new_size, CUFFT_C2C, 1));
// CUFFT plan advanced API
cufftHandle plan_adv;
size_t workSize;
long long int new_size_long = new_size;
checkCudaErrors(cufftCreate(&plan_adv));
checkCudaErrors(cufftXtMakePlanMany(plan_adv, 1, &new_size_long, NULL, 1, 1,
CUDA_C_32F, NULL, 1, 1, CUDA_C_32F, 1,
&workSize, CUDA_C_32F));
printf("Temporary buffer size %li bytes\n", workSize);
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
reinterpret_cast<cufftComplex *>(d_signal),
CUFFT_FORWARD));
checkCudaErrors(cufftExecC2C(
plan_adv, reinterpret_cast<cufftComplex *>(d_filter_kernel),
reinterpret_cast<cufftComplex *>(d_filter_kernel), CUFFT_FORWARD));
// Multiply the coefficients together and normalize the result
printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size,
1.0f / new_size);
// Check if kernel execution generated an error
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
reinterpret_cast<cufftComplex *>(d_signal),
CUFFT_INVERSE));
// Copy device memory to host
Complex *h_convolved_signal = h_padded_signal;
checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size,
cudaMemcpyDeviceToHost));
// Allocate host memory for the convolution result
Complex *h_convolved_signal_ref =
reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));
// Convolve on the host
Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE,
h_convolved_signal_ref);
// check result
bool bTestResult = sdkCompareL2fe(
reinterpret_cast<float *>(h_convolved_signal_ref),
reinterpret_cast<float *>(h_convolved_signal), 2 * SIGNAL_SIZE, 1e-5f);
// Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
checkCudaErrors(cufftDestroy(plan_adv));
// cleanup memory
free(h_signal);
free(h_filter_kernel);
free(h_padded_signal);
free(h_padded_filter_kernel);
free(h_convolved_signal_ref);
checkCudaErrors(cudaFree(d_signal));
checkCudaErrors(cudaFree(d_filter_kernel));
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel,
int filter_kernel_size) {
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data =
reinterpret_cast<Complex *>(malloc(sizeof(Complex) * new_size));
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = reinterpret_cast<Complex *>(malloc(sizeof(Complex) * new_size));
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0,
(new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel,
minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
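// Worked example of the padding above with SIGNAL_SIZE = 50 and
// FILTER_KERNEL_SIZE = 11: minRadius = 5, maxRadius = 6, new_size = 56.
// The padded signal is the 50 samples followed by 6 zeros. The padded filter
// stores taps 5..10 at indices 0..5, zeros at indices 6..50, and taps 0..4
// wrapped to indices 51..55, which is the layout cyclic convolution via the
// FFT expects.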
////////////////////////////////////////////////////////////////////////////////
// Filtering operations
////////////////////////////////////////////////////////////////////////////////
// Computes convolution on the host
void Convolve(const Complex *signal, int signal_size,
const Complex *filter_kernel, int filter_kernel_size,
Complex *filtered_signal) {
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
// Loop over output element indices
for (int i = 0; i < signal_size; ++i) {
filtered_signal[i].x = filtered_signal[i].y = 0;
// Loop over convolution indices
for (int j = -maxRadius + 1; j <= minRadius; ++j) {
int k = i + j;
if (k >= 0 && k < signal_size) {
filtered_signal[i] =
ComplexAdd(filtered_signal[i],
ComplexMul(signal[k], filter_kernel[minRadius - j]));
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////
// Complex addition
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b) {
Complex c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}
// Complex scale
static __device__ __host__ inline Complex ComplexScale(Complex a, float s) {
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
// Complex multiplication
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) {
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(Complex *a, const Complex *b,
int size, float scale) {
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads) {
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}
}
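// The kernel above uses a grid-stride loop so that the fixed <<<32, 256>>>
// launch in runTest() covers any signal length. A minimal, generic sketch of
// the same pattern (illustrative only, not used by this sample):
static __global__ void gridStrideScale(float *data, int n, float scale)
{
    const int stride = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        data[i] *= scale; // each thread handles elements i, i+stride, ...
    }
}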
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Example showing the use of CUFFT for fast 1D-convolution using FFT. */
// System includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// CUDA runtime
#include <cuda_runtime.h>
//CUFFT Header file
#include <cufftXt.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale( cufftComplex *, cufftComplex *, int, float);
// Helper that launches the pointwise multiply-and-scale kernel on each GPU
void multiplyCoefficient ( cudaLibXtDesc *, cudaLibXtDesc *, int, float, int);
// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);
// Padding functions
int PadData(const Complex *, Complex **, int,
const Complex *, Complex **, int);
////////////////////////////////////////////////////////////////////////////////
// Data configuration
// The filter size is assumed to be a number smaller than the signal size
///////////////////////////////////////////////////////////////////////////////
const int SIGNAL_SIZE = 1018;
const int FILTER_KERNEL_SIZE = 11;
const int GPU_COUNT = 2;
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("\n[simpleCUFFT_MGPU] is starting...\n\n");
int GPU_N;
checkCudaErrors(cudaGetDeviceCount(&GPU_N));
if (GPU_N < GPU_COUNT)
{
printf("No. of GPU on node %d\n", GPU_N);
printf("Two GPUs are required to run simpleCUFFT_MGPU sample code\n");
exit(EXIT_WAIVED);
}
int *major_minor = (int *) malloc(sizeof(int)*GPU_N*2);
int found2IdenticalGPUs = 0;
int nGPUs = 2;
int *whichGPUs = (int *) malloc(sizeof(int) * nGPUs);
for(int i=0; i<GPU_N; i++)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
major_minor[i*2] = deviceProp.major;
major_minor[i*2 + 1] = deviceProp.minor;
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, deviceProp.name, deviceProp.major, deviceProp.minor);
}
for (int i=0; i<GPU_N; i++)
{
for (int j=i+1; j<GPU_N; j++)
{
if((major_minor[i*2] == major_minor[j*2]) && (major_minor[i*2 + 1] == major_minor[j*2 + 1]))
{
whichGPUs[0] = i;
whichGPUs[1] = j;
found2IdenticalGPUs = 1;
break;
}
}
if (found2IdenticalGPUs)
{
break;
}
}
free(major_minor);
if (!found2IdenticalGPUs)
{
printf("No Two GPUs with same architecture found\nWaiving simpleCUFFT_2d_MGPU sample\n");
exit(EXIT_WAIVED);
}
// Allocate host memory for the signal
Complex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initialize the memory for the signal
for (int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
}
// Allocate host memory for the filter
Complex *h_filter_kernel = (Complex *)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE);
// Initialize the memory for the filter
for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
{
h_filter_kernel[i].x = rand() / (float)RAND_MAX;
h_filter_kernel[i].y = 0;
}
// Pad signal and filter kernel
Complex *h_padded_signal;
Complex *h_padded_filter_kernel;
int new_size = PadData(h_signal, &h_padded_signal, SIGNAL_SIZE,
h_filter_kernel, &h_padded_filter_kernel,
FILTER_KERNEL_SIZE);
// cufftCreate() - Create an empty plan
cufftResult result;
cufftHandle plan_input;
checkCudaErrors (cufftCreate (&plan_input));
// cufftXtSetGPUs() - Define which GPUs to use
result = cufftXtSetGPUs (plan_input, nGPUs, whichGPUs);
if (result == CUFFT_INVALID_DEVICE)
{
printf ("This sample requires two GPUs on the same board.\n");
printf ("No such board was found. Waiving sample.\n");
exit (EXIT_WAIVED);
}
else if (result != CUFFT_SUCCESS)
{
printf ("cufftXtSetGPUs failed\n"); exit (EXIT_FAILURE);
}
// Print information about the GPUs used to run this code
printf("\nRunning on GPUs\n");
for (int i = 0 ; i < nGPUs ; i++)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, whichGPUs[i]));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", whichGPUs[i], deviceProp.name, deviceProp.major, deviceProp.minor);
}
size_t* worksize;
worksize =(size_t*)malloc(sizeof(size_t) * nGPUs);
// cufftMakePlan1d() - Create the plan
checkCudaErrors(cufftMakePlan1d(plan_input, new_size, CUFFT_C2C, 1, worksize));
// cufftXtMalloc() - Malloc data on multiple GPUs
cudaLibXtDesc *d_signal ;
checkCudaErrors(cufftXtMalloc (plan_input, (cudaLibXtDesc **)&d_signal, CUFFT_XT_FORMAT_INPLACE));
cudaLibXtDesc *d_out_signal ;
checkCudaErrors(cufftXtMalloc (plan_input, (cudaLibXtDesc **)&d_out_signal, CUFFT_XT_FORMAT_INPLACE));
cudaLibXtDesc *d_filter_kernel;
checkCudaErrors(cufftXtMalloc (plan_input, (cudaLibXtDesc **)&d_filter_kernel, CUFFT_XT_FORMAT_INPLACE));
cudaLibXtDesc *d_out_filter_kernel;
checkCudaErrors(cufftXtMalloc (plan_input, (cudaLibXtDesc **)&d_out_filter_kernel, CUFFT_XT_FORMAT_INPLACE));
// cufftXtMemcpy() - Copy data from host to multiple GPUs
checkCudaErrors(cufftXtMemcpy (plan_input,d_signal, h_padded_signal, CUFFT_COPY_HOST_TO_DEVICE));
checkCudaErrors(cufftXtMemcpy (plan_input, d_filter_kernel, h_padded_filter_kernel, CUFFT_COPY_HOST_TO_DEVICE));
// cufftXtExecDescriptorC2C() - Execute FFT on data on multiple GPUs
checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_signal, d_signal, CUFFT_FORWARD));
checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_filter_kernel, d_filter_kernel, CUFFT_FORWARD));
// cufftXtMemcpy() - Copy the data to natural order on GPUs
checkCudaErrors(cufftXtMemcpy (plan_input, d_out_signal, d_signal, CUFFT_COPY_DEVICE_TO_DEVICE));
checkCudaErrors(cufftXtMemcpy (plan_input, d_out_filter_kernel, d_filter_kernel, CUFFT_COPY_DEVICE_TO_DEVICE));
printf("\n\nValue of Library Descriptor\n");
printf("Number of GPUs %d\n", d_out_signal->descriptor->nGPUs );
printf("Device id %d %d\n", d_out_signal->descriptor->GPUs[0], d_out_signal->descriptor->GPUs[1]);
printf("Data size on GPU %ld %ld\n", (long)(d_out_signal->descriptor->size[0]/sizeof(cufftComplex)), (long)(d_out_signal->descriptor->size[1]/sizeof(cufftComplex))) ;
//Multiply the coefficients together and normalize the result
printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
multiplyCoefficient(d_out_signal,d_out_filter_kernel, new_size, 1.0f / new_size, nGPUs );
// cufftXtExecDescriptorC2C() - Execute inverse FFT on data on multiple GPUs
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftXtExecDescriptorC2C(plan_input, d_out_signal, d_out_signal, CUFFT_INVERSE));
// Create host pointer pointing to padded signal
Complex *h_convolved_signal = h_padded_signal;
// Allocate host memory for the convolution result
Complex *h_convolved_signal_ref = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// cufftXtMemcpy() - Copy data from multiple GPUs to host
checkCudaErrors(cufftXtMemcpy (plan_input,h_convolved_signal, d_out_signal, CUFFT_COPY_DEVICE_TO_HOST));
// Convolve on the host
Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel,
FILTER_KERNEL_SIZE, h_convolved_signal_ref);
// Compare CPU and GPU result
bool bTestResult = sdkCompareL2fe((float *)h_convolved_signal_ref,
(float *)h_convolved_signal, 2 * SIGNAL_SIZE,
1e-5f);
printf("\nvalue of TestResult %d\n", bTestResult);
// Cleanup memory
free(whichGPUs);
free(worksize);
free(h_signal);
free(h_filter_kernel);
free(h_padded_signal);
free(h_padded_filter_kernel);
free(h_convolved_signal_ref);
// cufftXtFree() - Free GPU memory
checkCudaErrors(cufftXtFree(d_signal));
checkCudaErrors(cufftXtFree(d_filter_kernel));
checkCudaErrors(cufftXtFree(d_out_signal));
checkCudaErrors(cufftXtFree(d_out_filter_kernel));
// cufftDestroy() - Destroy FFT plan
checkCudaErrors(cufftDestroy(plan_input));
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
///////////////////////////////////////////////////////////////////////////////////
// Function for padding original data
//////////////////////////////////////////////////////////////////////////////////
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
////////////////////////////////////////////////////////////////////////////////
// Filtering operations - Computing Convolution on the host
////////////////////////////////////////////////////////////////////////////////
void Convolve(const Complex *signal, int signal_size,
const Complex *filter_kernel, int filter_kernel_size,
Complex *filtered_signal)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
// Loop over output element indices
for (int i = 0; i < signal_size; ++i)
{
filtered_signal[i].x = filtered_signal[i].y = 0;
// Loop over convolution indices
for (int j = - maxRadius + 1; j <= minRadius; ++j)
{
int k = i + j;
if (k >= 0 && k < signal_size)
{
filtered_signal[i] = ComplexAdd(filtered_signal[i], ComplexMul(signal[k], filter_kernel[minRadius - j]));
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Launch kernel on multiple GPUs
////////////////////////////////////////////////////////////////////////////////
void multiplyCoefficient( cudaLibXtDesc *d_signal,cudaLibXtDesc *d_filter_kernel,
int new_size, float val , int nGPUs)
{
int device ;
// Launch the ComplexPointwiseMulAndScale<<< >>> kernel on each GPU
for(int i=0; i < nGPUs ;i++)
{
device = d_signal->descriptor->GPUs[i];
//Set device
checkCudaErrors(cudaSetDevice(device));
//Perform GPU computations
ComplexPointwiseMulAndScale<<<32, 256>>>((cufftComplex*) d_signal->descriptor->data[i],
(cufftComplex*) d_filter_kernel->descriptor->data[i],
int(d_signal->descriptor->size[i]/sizeof(cufftComplex)), val);
}
// Wait for each device to finish all operations
for( int i=0; i< nGPUs ; i++ )
{
device = d_signal->descriptor->GPUs[i];
checkCudaErrors(cudaSetDevice(device));
cudaDeviceSynchronize();
// Check if kernel execution generated an error
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
}
}
////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////
// Complex addition
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)
{
Complex c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}
// Complex scale
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
// Complex multiplication
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
{
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Example showing the use of CUFFT for fast 1D-convolution using FFT.
* This sample is the same as simpleCUFFT, except that it uses a callback
* function to perform the pointwise multiply and scale, on input to the
* inverse transform.
*
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_functions.h>
#include <helper_cuda.h>
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
// This is the callback routine prototype
static __device__ cufftComplex ComplexPointwiseMulAndScale(void * a, size_t index, void * cb_info, void *sharedmem);
typedef struct _cb_params{
Complex *filter;
float scale;
} cb_params;
// This is the callback routine. It does complex pointwise multiplication with scaling.
static __device__ cufftComplex ComplexPointwiseMulAndScale(void *a, size_t index, void *cb_info, void *sharedmem)
{
cb_params * my_params = (cb_params *)cb_info;
return (cufftComplex)ComplexScale(ComplexMul(((Complex *)a)[index],
(my_params->filter)[index]),
my_params->scale);
}
// Define the device pointer to the callback routine. The host code will fetch this and pass it to CUFFT
__device__ cufftCallbackLoadC myOwnCallbackPtr = ComplexPointwiseMulAndScale;
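// Note: host code cannot take the address of a __device__ function directly,
// so the callback pointer is published through this __device__ variable and
// fetched with cudaMemcpyFromSymbol() in runTest() before being passed to
// cufftXtSetCallback().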
// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);
// Padding functions
int PadData(const Complex *, Complex **, int,
const Complex *, Complex **, int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 50
#define FILTER_KERNEL_SIZE 11
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
struct cudaDeviceProp properties;
int device;
checkCudaErrors(cudaGetDevice(&device));
checkCudaErrors(cudaGetDeviceProperties(&properties, device));
if( !(properties.major >= 2) ) {
printf("simpleCUFFT_callback requires CUDA architecture SM2.0 or higher\n");
return EXIT_WAIVED;
}
return runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUFFT callbacks
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, char **argv)
{
printf("[simpleCUFFT_callback] is starting...\n");
findCudaDevice(argc, (const char **)argv);
// Allocate host memory for the signal
Complex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initialize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
}
// Allocate host memory for the filter
Complex *h_filter_kernel = (Complex *)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE);
// Initialize the memory for the filter
for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; ++i)
{
h_filter_kernel[i].x = rand() / (float)RAND_MAX;
h_filter_kernel[i].y = 0;
}
// Pad signal and filter kernel
Complex *h_padded_signal;
Complex *h_padded_filter_kernel;
int new_size = PadData(h_signal, &h_padded_signal, SIGNAL_SIZE,
h_filter_kernel, &h_padded_filter_kernel, FILTER_KERNEL_SIZE);
int mem_size = sizeof(Complex) * new_size;
// Allocate device memory for signal
Complex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, mem_size));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_padded_signal, mem_size,
cudaMemcpyHostToDevice));
// Allocate device memory for filter kernel
Complex *d_filter_kernel;
checkCudaErrors(cudaMalloc((void **)&d_filter_kernel, mem_size));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size,
cudaMemcpyHostToDevice));
// Create one CUFFT plan for the forward transforms, and one for the reverse transform
// with load callback.
cufftHandle plan, cb_plan;
size_t work_size;
checkCudaErrors(cufftCreate(&plan));
checkCudaErrors(cufftCreate(&cb_plan));
checkCudaErrors(cufftMakePlan1d(plan, new_size, CUFFT_C2C, 1, &work_size));
checkCudaErrors(cufftMakePlan1d(cb_plan, new_size, CUFFT_C2C, 1, &work_size));
// Define a structure used to pass in the device address of the filter kernel, and
// the scale factor
cb_params h_params;
h_params.filter = d_filter_kernel;
h_params.scale = 1.0f / new_size;
// Allocate device memory for parameters
cb_params *d_params;
checkCudaErrors(cudaMalloc((void **)&d_params, sizeof(cb_params)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_params, &h_params, sizeof(cb_params),
cudaMemcpyHostToDevice));
// The host needs to get a copy of the device pointer to the callback
cufftCallbackLoadC hostCopyOfCallbackPtr;
checkCudaErrors(cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr,
myOwnCallbackPtr,
sizeof(hostCopyOfCallbackPtr)));
// Now associate the load callback with the plan.
cufftResult status = cufftXtSetCallback(cb_plan,
(void **)&hostCopyOfCallbackPtr,
CUFFT_CB_LD_COMPLEX,
(void **)&d_params);
if (status == CUFFT_LICENSE_ERROR)
{
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
return EXIT_WAIVED;
}
checkCudaErrors(cufftXtSetCallback(cb_plan,
(void **)&hostCopyOfCallbackPtr,
CUFFT_CB_LD_COMPLEX,
(void **)&d_params));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_filter_kernel, (cufftComplex *)d_filter_kernel, CUFFT_FORWARD));
// Transform signal back, using the callback to do the pointwise multiply on the way in.
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(cb_plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
Complex *h_convolved_signal = h_padded_signal;
checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size,
cudaMemcpyDeviceToHost));
// Allocate host memory for the convolution result
Complex *h_convolved_signal_ref = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Convolve on the host
Convolve(h_signal, SIGNAL_SIZE,
h_filter_kernel, FILTER_KERNEL_SIZE,
h_convolved_signal_ref);
// check result
bool bTestResult = sdkCompareL2fe((float *)h_convolved_signal_ref, (float *)h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f);
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
checkCudaErrors(cufftDestroy(cb_plan));
// cleanup memory
free(h_signal);
free(h_filter_kernel);
free(h_padded_signal);
free(h_padded_filter_kernel);
free(h_convolved_signal_ref);
checkCudaErrors(cudaFree(d_signal));
checkCudaErrors(cudaFree(d_filter_kernel));
checkCudaErrors(cudaFree(d_params));
return bTestResult ? EXIT_SUCCESS : EXIT_FAILURE;
}
// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
////////////////////////////////////////////////////////////////////////////////
// Filtering operations
////////////////////////////////////////////////////////////////////////////////
// Computes convolution on the host
void Convolve(const Complex *signal, int signal_size,
const Complex *filter_kernel, int filter_kernel_size,
Complex *filtered_signal)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
// Loop over output element indices
for (int i = 0; i < signal_size; ++i)
{
filtered_signal[i].x = filtered_signal[i].y = 0;
// Loop over convolution indices
for (int j = - maxRadius + 1; j <= minRadius; ++j)
{
int k = i + j;
if (k >= 0 && k < signal_size)
{
filtered_signal[i] = ComplexAdd(filtered_signal[i], ComplexMul(signal[k], filter_kernel[minRadius - j]));
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////
// Complex addition
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)
{
Complex c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}
// Complex scale
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
// Complex multiplication
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "../inc/piestimator.h"
#include <string>
#include <vector>
#include <numeric>
#include <stdexcept>
#include <typeinfo>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <curand.h>
#include <curand_kernel.h>
#include "../inc/cudasharedmem.h"
using std::string;
using std::vector;
// Helper templates to support float and double in same code
template <typename L, typename R>
struct TYPE_IS
{
static const bool test = false;
};
template <typename L>
struct TYPE_IS<L, L>
{
static const bool test = true;
};
template <bool, class L, class R>
struct IF
{
typedef R type;
};
template <class L, class R>
struct IF<true, L, R>
{
typedef L type;
};
// RNG init kernel
template <typename rngState_t, typename rngDirectionVectors_t>
__global__ void initRNG(rngState_t *const rngStates,
rngDirectionVectors_t *const rngDirections,
unsigned int numDrawsPerDirection)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;
// Determine offset to avoid overlapping sub-sequences
unsigned int offset = tid * ((numDrawsPerDirection + step - 1) / step);
// Initialise the RNG
curand_init(rngDirections[0], offset, &rngStates[tid]);
curand_init(rngDirections[1], offset, &rngStates[tid + step]);
}
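// Worked example of the offset above (illustrative numbers): with
// step = 512 total threads and numDrawsPerDirection = 100000, each thread
// skips ahead by tid * ceil(100000 / 512) = tid * 196 positions, so
// consecutive threads draw from disjoint stretches of the same Sobol
// sequence.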
__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta)
{
extern __shared__ unsigned int sdata[];
// Perform first level of reduction:
// - Write to shared memory
unsigned int ltid = threadIdx.x;
sdata[ltid] = in;
cg::sync(cta);
// Do reduction in shared mem
for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
if (ltid < s)
{
sdata[ltid] += sdata[ltid + s];
}
cg::sync(cta);
}
return sdata[0];
}
__device__ inline void getPoint(float &x, float &y, curandStateSobol32 &state1, curandStateSobol32 &state2)
{
x = curand_uniform(&state1);
y = curand_uniform(&state2);
}
__device__ inline void getPoint(double &x, double &y, curandStateSobol64 &state1, curandStateSobol64 &state2)
{
x = curand_uniform_double(&state1);
y = curand_uniform_double(&state2);
}
// Estimator kernel
template <typename Real, typename rngState_t>
__global__ void computeValue(unsigned int *const results,
rngState_t *const rngStates,
const unsigned int numSims)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Determine thread ID
unsigned int bid = blockIdx.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;
// Load this thread's pair of RNG states (initialised earlier by initRNG)
rngState_t localState1 = rngStates[tid];
rngState_t localState2 = rngStates[tid + step];
// Count the number of points which lie inside the unit quarter-circle
unsigned int pointsInside = 0;
for (unsigned int i = tid ; i < numSims ; i += step)
{
Real x;
Real y;
getPoint(x, y, localState1, localState2);
Real l2norm2 = x * x + y * y;
if (l2norm2 < static_cast<Real>(1))
{
pointsInside++;
}
}
// Reduce within the block
pointsInside = reduce_sum(pointsInside, cta);
// Store the result
if (threadIdx.x == 0)
{
results[bid] = pointsInside;
}
}
template <typename Real>
PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize)
: m_numSims(numSims),
m_device(device),
m_threadBlockSize(threadBlockSize)
{
}
template <typename Real>
Real PiEstimator<Real>::operator()()
{
cudaError_t cudaResult = cudaSuccess;
struct cudaDeviceProp deviceProperties;
struct cudaFuncAttributes funcAttributes;
// Determine type of generator to use (32- or 64-bit)
typedef typename IF<TYPE_IS<Real, double>::test, curandStateSobol64_t, curandStateSobol32_t>::type curandStateSobol_sz;
typedef typename IF<TYPE_IS<Real, double>::test, curandDirectionVectors64_t, curandDirectionVectors32_t>::type curandDirectionVectors_sz;
// Get device properties
cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not get device properties: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Check precision is valid
if (typeid(Real) == typeid(double) &&
(deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3)))
{
throw std::runtime_error("Device does not have double precision support");
}
// Attach to GPU
cudaResult = cudaSetDevice(m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not set CUDA device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Determine how to divide the work between cores
dim3 block;
dim3 grid;
block.x = m_threadBlockSize;
grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
// Aim to launch around ten or more times as many blocks as there
// are multiprocessors on the target device.
unsigned int blocksPerSM = 10;
unsigned int numSMs = deviceProperties.multiProcessorCount;
while (grid.x > 2 * blocksPerSM * numSMs)
{
grid.x >>= 1;
}
// Get initRNG function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG<curandStateSobol_sz, curandDirectionVectors_sz>);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for initRNG kernel");
}
// Get computeValue function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real, curandStateSobol_sz>);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for computeValue kernel");
}
// Check the dimensions are valid
if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0])
{
throw std::runtime_error("Block X dimension is too large for device");
}
if (grid.x > (unsigned int)deviceProperties.maxGridSize[0])
{
throw std::runtime_error("Grid X dimension is too large for device");
}
// Allocate memory for RNG states and direction vectors
curandStateSobol_sz *d_rngStates = 0;
curandDirectionVectors_sz *d_rngDirections = 0;
cudaResult = cudaMalloc((void **)&d_rngStates, 2 * grid.x * block.x * sizeof(curandStateSobol_sz));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for RNG states: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
cudaResult = cudaMalloc((void **)&d_rngDirections, 2 * sizeof(curandDirectionVectors_sz));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for RNG direction vectors: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Allocate memory for result
// Each thread block will produce one result
unsigned int *d_results = 0;
cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for partial results: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Generate direction vectors on the host and copy to the device
if (typeid(Real) == typeid(float))
{
curandDirectionVectors32_t *rngDirections;
curandStatus_t curandResult = curandGetDirectionVectors32(&rngDirections, CURAND_DIRECTION_VECTORS_32_JOEKUO6);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not get direction vectors for quasi-random number generator: ");
msg += std::to_string(static_cast<int>(curandResult));
throw std::runtime_error(msg);
}
cudaResult = cudaMemcpy(d_rngDirections, rngDirections, 2 * sizeof(curandDirectionVectors32_t), cudaMemcpyHostToDevice);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy direction vectors to device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
}
else if (typeid(Real) == typeid(double))
{
curandDirectionVectors64_t *rngDirections;
curandStatus_t curandResult = curandGetDirectionVectors64(&rngDirections, CURAND_DIRECTION_VECTORS_64_JOEKUO6);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not get direction vectors for quasi-random number generator: ");
msg += std::to_string(static_cast<int>(curandResult));
throw std::runtime_error(msg);
}
cudaResult = cudaMemcpy(d_rngDirections, rngDirections, 2 * sizeof(curandDirectionVectors64_t), cudaMemcpyHostToDevice);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy direction vectors to device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
}
else
{
string msg("Could not get direction vectors for random number generator of specified type");
throw std::runtime_error(msg);
}
// Initialise RNG
initRNG<<<grid, block>>>(d_rngStates, d_rngDirections, m_numSims);
// Count the points inside unit quarter-circle
computeValue<Real><<<grid, block, block.x * sizeof(unsigned int)>>>(d_results, d_rngStates, m_numSims);
// Copy partial results back
vector<unsigned int> results(grid.x);
cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy partial results to host: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Complete sum-reduction on host
Real value = static_cast<Real>(std::accumulate(results.begin(), results.end(), 0));
// Determine the proportion of points inside the quarter-circle,
// i.e. the area of the unit quarter-circle
value /= m_numSims;
// Value is currently an estimate of the area of a unit quarter-circle, so we can
// scale to a full circle by multiplying by four. Now since the area of a circle
// is pi * r^2, and r is one, the value will be an estimate for the value of pi.
value *= 4;
// Cleanup
if (d_rngStates)
{
cudaFree(d_rngStates);
d_rngStates = 0;
}
if (d_rngDirections)
{
cudaFree(d_rngDirections);
d_rngDirections = 0;
}
if (d_results)
{
cudaFree(d_results);
d_results = 0;
}
return value;
}
// Explicit template instantiation
template class PiEstimator<float>;
template class PiEstimator<double>;
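/*
 * Illustrative host-side sketch (hypothetical helper, not from the CUDA
 * samples) of the quarter-circle estimate computed above: count the points
 * whose squared L2 norm is below one and scale the hit ratio by four.
 */
#include <vector>
#include <cstddef>
static double piEstimateHostSketch(const std::vector<float> &xs,
                                   const std::vector<float> &ys)
{
    if (xs.empty())
    {
        return 0.0;
    }
    std::size_t inside = 0;
    for (std::size_t i = 0; i < xs.size(); ++i)
    {
        // A point lies inside the unit quarter-circle when x*x + y*y < 1
        if (xs[i] * xs[i] + ys[i] * ys[i] < 1.0f)
        {
            ++inside;
        }
    }
    // The hit ratio approximates the quarter-circle area (pi / 4)
    return 4.0 * static_cast<double>(inside) / static_cast<double>(xs.size());
}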
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "../inc/pricingengine.h"
#include <string>
#include <vector>
#include <numeric>
#include <stdexcept>
#include <typeinfo>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <curand_kernel.h>
#include "../inc/asianoption.h"
#include "../inc/cudasharedmem.h"
using std::string;
using std::vector;
// RNG init kernel
__global__ void initRNG(curandState *const rngStates,
const unsigned int seed)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Initialise the RNG
curand_init(seed, tid, 0, &rngStates[tid]);
}
__device__ inline float getPathStep(float &drift, float &diffusion, curandState &state)
{
return expf(drift + diffusion * curand_normal(&state));
}
__device__ inline double getPathStep(double &drift, double &diffusion, curandState &state)
{
return exp(drift + diffusion * curand_normal_double(&state));
}
// Path generation kernel
template <typename Real>
__global__ void generatePaths(Real *const paths,
curandState *const rngStates,
const AsianOption<Real> *const option,
const unsigned int numSims,
const unsigned int numTimesteps)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;
// Compute parameters
Real drift = (option->r - static_cast<Real>(0.5) * option->sigma * option->sigma) * option->dt;
Real diffusion = option->sigma * sqrt(option->dt);
// Copy the RNG state to local memory for efficiency
curandState localState = rngStates[tid];
for (unsigned int i = tid ; i < numSims ; i += step)
{
// Shift the output pointer
Real *output = paths + i;
// Simulate the path
Real s = static_cast<Real>(1);
for (unsigned int t = 0 ; t < numTimesteps ; t++, output += numSims)
{
s *= getPathStep(drift, diffusion, localState);
*output = s;
}
}
}
template <typename Real>
__device__ Real reduce_sum(Real in, cg::thread_block cta)
{
SharedMemory<Real> sdata;
// Perform first level of reduction:
// - Write to shared memory
unsigned int ltid = threadIdx.x;
sdata[ltid] = in;
cg::sync(cta);
// Do reduction in shared mem
for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
if (ltid < s)
{
sdata[ltid] += sdata[ltid + s];
}
cg::sync(cta);
}
return sdata[0];
}
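/*
 * Illustrative host-side sketch (hypothetical helper, not from the CUDA
 * samples) of the pairwise tree reduction performed in shared memory above:
 * each pass folds the upper half of the active range onto the lower half.
 * Assumes the input size is a power of two, like blockDim.x in the kernel.
 */
#include <vector>
#include <cstddef>
template <typename Real>
static Real reduceSumHostSketch(std::vector<Real> data)
{
    // Mirrors: for (s = blockDim.x / 2; s > 0; s >>= 1) sdata[ltid] += sdata[ltid + s]
    for (std::size_t s = data.size() / 2; s > 0; s >>= 1)
    {
        for (std::size_t i = 0; i < s; ++i)
        {
            data[i] += data[i + s];
        }
    }
    return data.empty() ? Real(0) : data[0];
}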
// Valuation kernel
template <typename Real>
__global__ void computeValue(Real *const values,
const Real *const paths,
const AsianOption<Real> *const option,
const unsigned int numSims,
const unsigned int numTimesteps)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Determine thread ID
unsigned int bid = blockIdx.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;
Real sumPayoffs = static_cast<Real>(0);
for (unsigned int i = tid ; i < numSims ; i += step)
{
// Shift the input pointer
const Real *path = paths + i;
// Compute the arithmetic average
Real avg = static_cast<Real>(0);
for (unsigned int t = 0 ; t < numTimesteps ; t++, path += numSims)
{
avg += *path;
}
avg = avg * option->spot / numTimesteps;
// Compute the payoff
Real payoff = avg - option->strike;
if (option->type == AsianOption<Real>::Put)
{
payoff = - payoff;
}
payoff = max(static_cast<Real>(0), payoff);
// Accumulate payoff locally
sumPayoffs += payoff;
}
// Reduce within the block
sumPayoffs = reduce_sum<Real>(sumPayoffs, cta);
// Store the result
if (threadIdx.x == 0)
{
values[bid] = sumPayoffs;
}
}
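/*
 * Illustrative host-side sketch (hypothetical helper, not from the CUDA
 * samples) of the per-path payoff evaluated in the kernel above: average the
 * path values, scale by the spot, subtract the strike (negated for a put)
 * and floor the result at zero.
 */
#include <vector>
#include <algorithm>
#include <cstddef>
template <typename Real>
static Real asianPayoffHostSketch(const std::vector<Real> &path,
                                  Real spot, Real strike, bool isPut)
{
    if (path.empty())
    {
        return Real(0);
    }
    Real avg = Real(0);
    for (std::size_t t = 0; t < path.size(); ++t)
    {
        avg += path[t];
    }
    // Arithmetic average of the simulated (normalised) prices, rescaled by spot
    avg = avg * spot / static_cast<Real>(path.size());
    Real payoff = isPut ? (strike - avg) : (avg - strike);
    return std::max(Real(0), payoff);
}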
template <typename Real>
PricingEngine<Real>::PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed)
: m_numSims(numSims),
m_device(device),
m_threadBlockSize(threadBlockSize),
m_seed(seed)
{
}
template <typename Real>
void PricingEngine<Real>::operator()(AsianOption<Real> &option)
{
cudaError_t cudaResult = cudaSuccess;
struct cudaDeviceProp deviceProperties;
struct cudaFuncAttributes funcAttributes;
// Get device properties
cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not get device properties: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Check precision is valid
unsigned int deviceVersion = deviceProperties.major * 10 + deviceProperties.minor;
if (typeid(Real) == typeid(double) && deviceVersion < 13)
{
throw std::runtime_error("Device does not have double precision support");
}
// Attach to GPU
cudaResult = cudaSetDevice(m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not set CUDA device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Determine how to divide the work between cores
dim3 block;
dim3 grid;
block.x = m_threadBlockSize;
grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
// Aim to launch around ten or more times as many blocks as there
// are multiprocessors on the target device.
unsigned int blocksPerSM = 10;
unsigned int numSMs = deviceProperties.multiProcessorCount;
while (grid.x > 2 * blocksPerSM * numSMs)
{
grid.x >>= 1;
}
// Get initRNG function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for initRNG kernel");
}
// Get generatePaths function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, generatePaths<Real>);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for generatePaths kernel");
}
// Get computeValue function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for computeValue kernel");
}
// Setup problem on GPU
AsianOption<Real> *d_option = 0;
cudaResult = cudaMalloc((void **)&d_option, sizeof(AsianOption<Real>));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for option data: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
cudaResult = cudaMemcpy(d_option, &option, sizeof(AsianOption<Real>), cudaMemcpyHostToDevice);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy data to device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Allocate memory for paths
Real *d_paths = 0;
int numTimesteps = static_cast<int>(option.tenor / option.dt);
cudaResult = cudaMalloc((void **)&d_paths, m_numSims * numTimesteps * sizeof(Real));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for paths: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Allocate memory for RNG states
curandState *d_rngStates = 0;
cudaResult = cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for RNG state: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Allocate memory for result
Real *d_values = 0;
cudaResult = cudaMalloc((void **)&d_values, grid.x * sizeof(Real));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for partial results: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Initialise RNG
initRNG<<<grid, block>>>(d_rngStates, m_seed);
// Generate paths
generatePaths<Real><<<grid, block>>>(d_paths, d_rngStates, d_option, m_numSims, numTimesteps);
// Compute value
computeValue<<<grid, block, block.x * sizeof(Real)>>>(d_values, d_paths, d_option, m_numSims, numTimesteps);
// Copy partial results back
vector<Real> values(grid.x);
cudaResult = cudaMemcpy(&values[0], d_values, grid.x * sizeof(Real), cudaMemcpyDeviceToHost);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy partial results to host: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Complete sum-reduction on host
option.value = std::accumulate(values.begin(), values.end(), static_cast<Real>(0));
// Compute the mean
option.value /= m_numSims;
// Discount to present value
option.value *= exp(- option.r * option.tenor);
// Cleanup
if (d_option)
{
cudaFree(d_option);
d_option = 0;
}
if (d_paths)
{
cudaFree(d_paths);
d_paths = 0;
}
if (d_rngStates)
{
cudaFree(d_rngStates);
d_rngStates = 0;
}
if (d_values)
{
cudaFree(d_values);
d_values = 0;
}
}
// Explicit template instantiation
template class PricingEngine<float>;
template class PricingEngine<double>;
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
////////////////////////////////////////////////////////////////////////////////
//
// simpleCUFFT_2d_MGPU.cu
//
// This sample code demonstrates the use of the CUFFT library for 2D data on multiple GPUs.
// Example showing the use of CUFFT for solving the 2D Poisson equation with FFTs on multiple GPUs.
// For reference we have used the equation given in
// http://www.bu.edu/pasi/files/2011/07/Lecture83.pdf
//
////////////////////////////////////////////////////////////////////////////////
// System includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// CUDA runtime
#include <cuda_runtime.h>
//CUFFT Header file
#include <cufftXt.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
// Complex data type
typedef float2 Complex;
// Data configuration
const int GPU_COUNT = 2;
const int BSZ_Y = 4;
const int BSZ_X = 4;
// Forward Declaration
void solvePoissonEquation(cudaLibXtDesc *, cudaLibXtDesc *, float **, int, int );
__global__ void solvePoisson(cufftComplex *, cufftComplex *, float *, int, int, int n_gpu);
///////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("\nPoisson equation using CUFFT library on Multiple GPUs is starting...\n\n");
int GPU_N ;
checkCudaErrors(cudaGetDeviceCount(&GPU_N));
if (GPU_N < GPU_COUNT)
{
printf("No. of GPU on node %d\n", GPU_N);
printf("Two GPUs are required to run simpleCUFFT_2d_MGPU sample code\n");
exit(EXIT_WAIVED);
}
int *major_minor = (int *) malloc(sizeof(int)*GPU_N*2);
int found2IdenticalGPUs = 0;
int nGPUs = 2;
int *whichGPUs ;
whichGPUs = (int*) malloc(sizeof(int) * nGPUs);
for(int i=0; i<GPU_N; i++)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
major_minor[i*2] = deviceProp.major;
major_minor[i*2 + 1] = deviceProp.minor;
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, deviceProp.name, deviceProp.major, deviceProp.minor);
}
for (int i=0; i<GPU_N; i++)
{
for (int j=i+1; j<GPU_N; j++)
{
if((major_minor[i*2] == major_minor[j*2]) && (major_minor[i*2 + 1] == major_minor[j*2 + 1]))
{
whichGPUs[0] = i;
whichGPUs[1] = j;
found2IdenticalGPUs = 1;
break;
}
}
if (found2IdenticalGPUs)
{
break;
}
}
free(major_minor);
if (!found2IdenticalGPUs)
{
printf("No Two GPUs with same architecture found\nWaiving simpleCUFFT_2d_MGPU sample\n");
exit(EXIT_WAIVED);
}
int N = 64;
float xMAX = 1.0f, xMIN = 0.0f, yMIN = 0.0f, h = (xMAX - xMIN)/((float)N), s = 0.1f, s2 = s*s;
float *x, *y, *f, *u_a, r2;
x = (float*)malloc(sizeof(float) *N*N);
y = (float*)malloc(sizeof(float) *N*N);
f = (float*)malloc(sizeof(float) *N*N);
u_a = (float*)malloc(sizeof(float) *N*N);
for (int j=0; j<N; j++)
for (int i=0; i<N; i++)
{
x[N*j+i] = xMIN + i*h;
y[N*j+i] = yMIN + j*h;
r2 = (x[N*j+i] - 0.5)*(x[N*j+i] - 0.5) + (y[N*j+i] - 0.5)*(y[N*j+i] - 0.5);
f[N*j+i] = (r2-2*s2)/(s2*s2)*exp(-r2/(2*s2));
u_a[N*j+i] = exp(-r2/(2*s2)); // analytical solution
}
float *k, *d_k[GPU_COUNT];
k = (float*)malloc(sizeof(float) *N);
for (int i=0; i<=N/2; i++)
{
k[i] = i * 2*M_PI;
}
for (int i=N/2+1; i<N; i++)
{
k[i] = (i - N) * 2*M_PI;
}
//Create a complex variable on host
Complex *h_f = (Complex *)malloc(sizeof(Complex) * N * N);
// Initialize the memory for the signal
for ( int i = 0; i < (N * N); i++)
{
h_f[i].x = f[i];
h_f[i].y = 0.0f;
}
// cufftCreate() - Create an empty plan
cufftResult result;
cufftHandle planComplex;
result = cufftCreate(&planComplex);
if (result != CUFFT_SUCCESS) { printf ("cufftCreate failed\n"); exit (EXIT_FAILURE); }
// cufftXtSetGPUs() - Define which GPUs to use
result = cufftXtSetGPUs (planComplex, nGPUs, whichGPUs);
if (result == CUFFT_INVALID_DEVICE)
{
printf ("This sample requires two GPUs on the same board.\n");
printf ("No such board was found. Waiving sample.\n");
exit (EXIT_WAIVED);
}
else if (result != CUFFT_SUCCESS)
{
printf ("cufftXtSetGPUs failed\n"); exit (EXIT_FAILURE);
}
//Print the device information to run the code
printf("\nRunning on GPUs\n");
for (int i = 0 ; i < 2 ; i++)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, whichGPUs[i]));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", whichGPUs[i], deviceProp.name, deviceProp.major, deviceProp.minor);
}
size_t* worksize;
worksize =(size_t*)malloc(sizeof(size_t) * nGPUs);
// cufftMakePlan2d() - Create the plan
result = cufftMakePlan2d(planComplex, N, N, CUFFT_C2C, worksize);
if (result != CUFFT_SUCCESS) { printf ("*MakePlan* failed\n"); exit (EXIT_FAILURE) ; }
for(int i=0; i<nGPUs; i++)
{
cudaSetDevice(whichGPUs[i]);
cudaMalloc ((void**)&d_k[i], sizeof(float)*N);
cudaMemcpy(d_k[i], k, sizeof(float)*N, cudaMemcpyHostToDevice);
}
// Create a variable on device
// d_f - variable on device to store the input data
// d_d_f - variable that stores the natural order of d_f data
// d_out - device output
cudaLibXtDesc *d_f,*d_d_f, *d_out ;
// cufftXtMalloc() - Malloc data on multiple GPUs
result = cufftXtMalloc (planComplex, (cudaLibXtDesc **)&d_f, CUFFT_XT_FORMAT_INPLACE);
if (result != CUFFT_SUCCESS) { printf ("*XtMalloc failed\n"); exit (EXIT_FAILURE) ; }
result = cufftXtMalloc (planComplex, (cudaLibXtDesc **)&d_d_f, CUFFT_XT_FORMAT_INPLACE);
if (result != CUFFT_SUCCESS) { printf ("*XtMalloc failed\n"); exit (EXIT_FAILURE) ; }
result = cufftXtMalloc (planComplex, (cudaLibXtDesc **)&d_out, CUFFT_XT_FORMAT_INPLACE);
if (result != CUFFT_SUCCESS) { printf ("*XtMalloc failed\n"); exit (EXIT_FAILURE) ; }
// cufftXtMemcpy() - Copy the data from host to device
result = cufftXtMemcpy (planComplex, d_f, h_f, CUFFT_COPY_HOST_TO_DEVICE);
if (result != CUFFT_SUCCESS) { printf ("*XtMemcpy failed\n"); exit (EXIT_FAILURE); }
// cufftXtExecDescriptorC2C() - Execute FFT on data on multiple GPUs
printf("Forward 2d FFT on multiple GPUs\n");
result = cufftXtExecDescriptorC2C(planComplex, d_f, d_f, CUFFT_FORWARD);
if (result != CUFFT_SUCCESS) { printf ("*XtExecC2C failed\n"); exit (EXIT_FAILURE); }
//cufftXtMemcpy() - Copy the data to natural order on GPUs
result = cufftXtMemcpy (planComplex, d_d_f, d_f, CUFFT_COPY_DEVICE_TO_DEVICE);
if (result != CUFFT_SUCCESS) { printf ("*XtMemcpy failed\n"); exit (EXIT_FAILURE); }
printf("Solve Poisson Equation\n" );
solvePoissonEquation(d_d_f, d_out, d_k, N, nGPUs);
printf("Inverse 2d FFT on multiple GPUs\n");
// cufftXtExecDescriptorC2C() - Execute inverse FFT on data on multiple GPUs
result = cufftXtExecDescriptorC2C(planComplex, d_out, d_out, CUFFT_INVERSE);
if (result != CUFFT_SUCCESS) { printf ("*XtExecC2C failed\n"); exit (EXIT_FAILURE); }
//Create a variable on host to copy the data from device
//h_d_out - variable that stores the output from the device
Complex *h_d_out = (Complex *)malloc(sizeof(Complex) * N * N);
// cufftXtMemcpy() - Copy data from multiple GPUs to host
result = cufftXtMemcpy (planComplex,h_d_out, d_out, CUFFT_COPY_DEVICE_TO_HOST);
if (result != CUFFT_SUCCESS) { printf ("*XtMemcpy failed\n"); exit (EXIT_FAILURE); }
float *out = (float *)malloc(sizeof(float) * N * N);
float constant = h_d_out[0].x / (N * N);
for (int i=0; i<N*N; i++)
{
//subtract u[0] to force the arbitrary constant to be 0
out[i] = (h_d_out[i].x / (N*N)) - constant;
}
// cleanup memory
free(h_f);
free(k);
free(out);
free(h_d_out);
free(x);
free(whichGPUs);
free(y);
free(f);
free(u_a);
free(worksize);
// cudaFree() / cufftXtFree() - Free GPU memory
for (int i=0; i<GPU_COUNT; i++) {
cudaFree(d_k[i]);
}
result = cufftXtFree(d_out);
if (result != CUFFT_SUCCESS) { printf ("*XtFree failed\n"); exit (EXIT_FAILURE); }
result = cufftXtFree(d_f);
if (result != CUFFT_SUCCESS) { printf ("*XtFree failed\n"); exit (EXIT_FAILURE); }
result = cufftXtFree(d_d_f);
if (result != CUFFT_SUCCESS) { printf ("*XtFree failed\n"); exit (EXIT_FAILURE); }
// cufftDestroy() - Destroy FFT plan
result = cufftDestroy(planComplex);
if (result != CUFFT_SUCCESS) { printf ("cufftDestroy failed: code %d\n",(int)result); exit (EXIT_FAILURE); }
exit(EXIT_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////////
//Launch kernel on multiple GPUs
///////////////////////////////////////////////////////////////////////////////////
void solvePoissonEquation(cudaLibXtDesc *d_ft, cudaLibXtDesc *d_ft_k, float **k, int N, int nGPUs)
{
int device ;
dim3 dimGrid(int(N/BSZ_X), int((N/nGPUs)/BSZ_Y));
dim3 dimBlock (BSZ_X, BSZ_Y);
for(int i=0; i < nGPUs ; i++)
{
device = d_ft_k->descriptor->GPUs[i];
cudaSetDevice(device) ;
solvePoisson<<<dimGrid,dimBlock>>>((cufftComplex*) d_ft->descriptor->data[i],
(cufftComplex*) d_ft_k->descriptor->data[i],
k[i], N, i, nGPUs);
}
// Wait for each device to finish all operations
for(int i=0; i< nGPUs ; i++)
{
device = d_ft_k->descriptor->GPUs[i];
cudaSetDevice( device );
cudaDeviceSynchronize();
// Check if kernel execution generated an error
getLastCudaError("Kernel execution failed [ solvePoisson ]");
}
}
////////////////////////////////////////////////////////////////////////////////
// Kernel for Solving Poisson equation on GPU
////////////////////////////////////////////////////////////////////////////////
__global__ void solvePoisson(cufftComplex *ft, cufftComplex *ft_k, float *k, int N, int gpu_id, int n_gpu)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
int j = threadIdx.y + blockIdx.y*blockDim.y;
int index = j*N+i;
if (i<N && j<N/n_gpu)
{
float k2 = k[i]*k[i] + k[j + gpu_id*N/n_gpu]*k[j + gpu_id*N/n_gpu];
if (i==0 && j==0 && gpu_id == 0)
{
k2 = 1.0f;
}
ft_k[index].x = -ft[index].x / k2;
ft_k[index].y = -ft[index].y / k2;
}
}
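/*
 * Illustrative single-device sketch (hypothetical helper, not from the CUDA
 * samples) of the spectral step performed by solvePoisson() above: in
 * Fourier space the Poisson equation laplacian(u) = f becomes
 * -(kx^2 + ky^2) * u_hat = f_hat, so each mode is scaled by -1/(kx^2 + ky^2),
 * with the zero mode pinned to fix the arbitrary constant. cufftComplex
 * comes from the cufftXt.h header already included above; f_hat is assumed
 * to be in row-major natural order.
 */
static void solvePoissonSpectralHostSketch(const cufftComplex *f_hat,
                                           cufftComplex *u_hat,
                                           const float *k, int N)
{
    for (int j = 0; j < N; ++j)
    {
        for (int i = 0; i < N; ++i)
        {
            float k2 = k[i] * k[i] + k[j] * k[j];
            if (i == 0 && j == 0)
            {
                k2 = 1.0f; // pin the zero mode instead of dividing by zero
            }
            u_hat[j * N + i].x = -f_hat[j * N + i].x / k2;
            u_hat[j * N + i].y = -f_hat[j * N + i].y / k2;
        }
    }
}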
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "../inc/piestimator.h"
#include <string>
#include <vector>
#include <numeric>
#include <stdexcept>
#include <typeinfo>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <curand.h>
using std::string;
using std::vector;
__device__ unsigned int reduce_sum(unsigned int in, cg::thread_block cta)
{
extern __shared__ unsigned int sdata[];
// Perform first level of reduction:
// - Write to shared memory
unsigned int ltid = threadIdx.x;
sdata[ltid] = in;
cg::sync(cta);
// Do reduction in shared mem
for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
if (ltid < s)
{
sdata[ltid] += sdata[ltid + s];
}
cg::sync(cta);
}
return sdata[0];
}
// Estimator kernel
template <typename Real>
__global__ void computeValue(unsigned int *const results,
const Real *const points,
const unsigned int numSims)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Determine thread ID
unsigned int bid = blockIdx.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;
// Shift the input/output pointers
const Real *pointx = points + tid;
const Real *pointy = pointx + numSims;
// Count the number of points which lie inside the unit quarter-circle
unsigned int pointsInside = 0;
for (unsigned int i = tid ; i < numSims ; i += step, pointx += step, pointy += step)
{
Real x = *pointx;
Real y = *pointy;
Real l2norm2 = x * x + y * y;
if (l2norm2 < static_cast<Real>(1))
{
pointsInside++;
}
}
// Reduce within the block
pointsInside = reduce_sum(pointsInside, cta);
// Store the result
if (threadIdx.x == 0)
{
results[bid] = pointsInside;
}
}
template <typename Real>
PiEstimator<Real>::PiEstimator(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed)
: m_numSims(numSims),
m_device(device),
m_threadBlockSize(threadBlockSize),
m_seed(seed)
{
}
template <typename Real>
Real PiEstimator<Real>::operator()()
{
cudaError_t cudaResult = cudaSuccess;
struct cudaDeviceProp deviceProperties;
struct cudaFuncAttributes funcAttributes;
// Get device properties
cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not get device properties: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Check precision is valid
if (typeid(Real) == typeid(double) &&
(deviceProperties.major < 1 || (deviceProperties.major == 1 && deviceProperties.minor < 3)))
{
throw std::runtime_error("Device does not have double precision support");
}
// Attach to GPU
cudaResult = cudaSetDevice(m_device);
if (cudaResult != cudaSuccess)
{
string msg("Could not set CUDA device: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Determine how to divide the work between cores
dim3 block;
dim3 grid;
block.x = m_threadBlockSize;
grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;
// Aim to launch around ten or more times as many blocks as there
// are multiprocessors on the target device.
unsigned int blocksPerSM = 10;
unsigned int numSMs = deviceProperties.multiProcessorCount;
while (grid.x > 2 * blocksPerSM * numSMs)
{
grid.x >>= 1;
}
// Get computeValue function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);
if (cudaResult != cudaSuccess)
{
string msg("Could not get function attributes: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
throw std::runtime_error("Block X dimension is too large for computeValue kernel");
}
// Check the dimensions are valid
if (block.x > (unsigned int)deviceProperties.maxThreadsDim[0])
{
throw std::runtime_error("Block X dimension is too large for device");
}
if (grid.x > (unsigned int)deviceProperties.maxGridSize[0])
{
throw std::runtime_error("Grid X dimension is too large for device");
}
// Allocate memory for points
// Each simulation has two random numbers to give X and Y coordinate
Real *d_points = 0;
cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(Real));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for random numbers: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Allocate memory for result
// Each thread block will produce one result
unsigned int *d_results = 0;
cudaResult = cudaMalloc((void **)&d_results, grid.x * sizeof(unsigned int));
if (cudaResult != cudaSuccess)
{
string msg("Could not allocate memory on device for partial results: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Generate random points in unit square
curandStatus_t curandResult;
curandGenerator_t prng;
curandResult = curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not create pseudo-random number generator: ");
msg += std::to_string(static_cast<int>(curandResult));
throw std::runtime_error(msg);
}
curandResult = curandSetPseudoRandomGeneratorSeed(prng, m_seed);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not set seed for pseudo-random number generator: ");
msg += std::to_string(static_cast<int>(curandResult));
throw std::runtime_error(msg);
}
if (typeid(Real) == typeid(float))
{
curandResult = curandGenerateUniform(prng, (float *)d_points, 2 * m_numSims);
}
else if (typeid(Real) == typeid(double))
{
curandResult = curandGenerateUniformDouble(prng, (double *)d_points, 2 * m_numSims);
}
else
{
string msg("Could not generate random numbers of specified type");
throw std::runtime_error(msg);
}
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not generate pseudo-random numbers: ");
msg += std::to_string(static_cast<int>(curandResult));
throw std::runtime_error(msg);
}
curandResult = curandDestroyGenerator(prng);
if (curandResult != CURAND_STATUS_SUCCESS)
{
string msg("Could not destroy pseudo-random number generator: ");
msg += std::to_string(static_cast<int>(curandResult));
throw std::runtime_error(msg);
}
// Count the points inside unit quarter-circle
computeValue<Real><<<grid, block, block.x * sizeof(unsigned int)>>>(d_results, d_points, m_numSims);
// Copy partial results back
vector<unsigned int> results(grid.x);
cudaResult = cudaMemcpy(&results[0], d_results, grid.x * sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (cudaResult != cudaSuccess)
{
string msg("Could not copy partial results to host: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// Complete sum-reduction on host
Real value = static_cast<Real>(std::accumulate(results.begin(), results.end(), 0));
// Determine the proportion of points inside the quarter-circle,
// i.e. the area of the unit quarter-circle
value /= m_numSims;
// Value is currently an estimate of the area of a unit quarter-circle, so we can
// scale to a full circle by multiplying by four. Now since the area of a circle
// is pi * r^2, and r is one, the value will be an estimate for the value of pi.
value *= 4;
// Cleanup
if (d_points)
{
cudaFree(d_points);
d_points = 0;
}
if (d_results)
{
cudaFree(d_results);
d_results = 0;
}
return value;
}
// Explicit template instantiation
template class PiEstimator<float>;
template class PiEstimator<double>;
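/*
 * Illustrative host-side sketch (hypothetical helper, not from the CUDA
 * samples) of the grid-sizing heuristic used by operator() above: start with
 * one thread per simulation, then halve the grid until it is no more than
 * about twice blocksPerSM blocks per multiprocessor.
 */
static unsigned int gridSizeSketch(unsigned int numSims,
                                   unsigned int blockSize,
                                   unsigned int numSMs)
{
    const unsigned int blocksPerSM = 10;
    unsigned int gridX = (numSims + blockSize - 1) / blockSize; // ceiling division
    while (gridX > 2 * blocksPerSM * numSMs)
    {
        gridX >>= 1; // halve until the launch is a modest multiple of the SM count
    }
    return gridX;
}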
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
///////////////////////////////////////////////////////////////////////////////
// Polynomial approximation of cumulative normal distribution function
///////////////////////////////////////////////////////////////////////////////
__device__ inline float cndGPU(float d)
{
const float A1 = 0.31938153f;
const float A2 = -0.356563782f;
const float A3 = 1.781477937f;
const float A4 = -1.821255978f;
const float A5 = 1.330274429f;
const float RSQRT2PI = 0.39894228040143267793994605993438f;
float
K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
float
cnd = RSQRT2PI * __expf(- 0.5f * d * d) *
(K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
if (d > 0)
cnd = 1.0f - cnd;
return cnd;
}
///////////////////////////////////////////////////////////////////////////////
// Black-Scholes formula for both call and put
///////////////////////////////////////////////////////////////////////////////
__device__ inline void BlackScholesBodyGPU(
float &CallResult,
float &PutResult,
float S, //Stock price
float X, //Option strike
float T, //Option years
float R, //Riskless rate
float V //Volatility rate
)
{
float sqrtT, expRT;
float d1, d2, CNDD1, CNDD2;
sqrtT = __fdividef(1.0F, rsqrtf(T));
d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
d2 = d1 - V * sqrtT;
CNDD1 = cndGPU(d1);
CNDD2 = cndGPU(d2);
//Calculate Call and Put simultaneously
expRT = __expf(- R * T);
CallResult = S * CNDD1 - X * expRT * CNDD2;
PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
}
////////////////////////////////////////////////////////////////////////////////
//Process an array of optN options on GPU
////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(128) __global__ void BlackScholesGPU(
float2 *__restrict d_CallResult,
float2 *__restrict d_PutResult,
float2 *__restrict d_StockPrice,
float2 *__restrict d_OptionStrike,
float2 *__restrict d_OptionYears,
float Riskfree,
float Volatility,
int optN
)
{
////Thread index
const int opt = blockDim.x * blockIdx.x + threadIdx.x;
// Calculating 2 options per thread to increase ILP (instruction level parallelism)
if (opt < (optN / 2))
{
float callResult1, callResult2;
float putResult1, putResult2;
BlackScholesBodyGPU(
callResult1,
putResult1,
d_StockPrice[opt].x,
d_OptionStrike[opt].x,
d_OptionYears[opt].x,
Riskfree,
Volatility
);
BlackScholesBodyGPU(
callResult2,
putResult2,
d_StockPrice[opt].y,
d_OptionStrike[opt].y,
d_OptionYears[opt].y,
Riskfree,
Volatility
);
d_CallResult[opt] = make_float2(callResult1, callResult2);
d_PutResult[opt] = make_float2(putResult1, putResult2);
}
}
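/*
 * Illustrative host-side double-precision sketch (hypothetical helper, not
 * the sample's BlackScholesCPU reference) of the same closed-form formula,
 * using the exact cumulative normal distribution via erfc instead of the
 * polynomial approximation used in cndGPU() above.
 */
#include <cmath>
static void blackScholesHostSketch(double &callResult, double &putResult,
                                   double S, double X, double T,
                                   double R, double V)
{
    const double sqrtT = std::sqrt(T);
    const double d1 = (std::log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
    const double d2 = d1 - V * sqrtT;
    // Exact CND: N(d) = 0.5 * erfc(-d / sqrt(2))
    const double cnd1 = 0.5 * std::erfc(-d1 / std::sqrt(2.0));
    const double cnd2 = 0.5 * std::erfc(-d2 / std::sqrt(2.0));
    const double expRT = std::exp(-R * T);
    callResult = S * cnd1 - X * expRT * cnd2;
    putResult = X * expRT * (1.0 - cnd2) - S * (1.0 - cnd1);
}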
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample evaluates fair call and put prices for a
* given set of European options by Black-Scholes formula.
* See supplied whitepaper for more explanations.
*/
#include <helper_functions.h> // helper functions for string parsing
#include <helper_cuda.h> // helper functions CUDA error checking and initialization
////////////////////////////////////////////////////////////////////////////////
// Process an array of optN options on CPU
////////////////////////////////////////////////////////////////////////////////
extern "C" void BlackScholesCPU(
float *h_CallResult,
float *h_PutResult,
float *h_StockPrice,
float *h_OptionStrike,
float *h_OptionYears,
float Riskfree,
float Volatility,
int optN
);
////////////////////////////////////////////////////////////////////////////////
// Process an array of OptN options on GPU
////////////////////////////////////////////////////////////////////////////////
#include "BlackScholes_kernel.cuh"
////////////////////////////////////////////////////////////////////////////////
// Helper function, returning uniformly distributed
// random float in [low, high] range
////////////////////////////////////////////////////////////////////////////////
float RandFloat(float low, float high)
{
float t = (float)rand() / (float)RAND_MAX;
return (1.0f - t) * low + t * high;
}
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int OPT_N = 4000000;
const int NUM_ITERATIONS = 512;
const int OPT_SZ = OPT_N * sizeof(float);
const float RISKFREE = 0.02f;
const float VOLATILITY = 0.30f;
#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// Start logs
printf("[%s] - Starting...\n", argv[0]);
//'h_' prefix - CPU (host) memory space
float
//Results calculated by CPU for reference
*h_CallResultCPU,
*h_PutResultCPU,
//CPU copy of GPU results
*h_CallResultGPU,
*h_PutResultGPU,
//CPU instance of input data
*h_StockPrice,
*h_OptionStrike,
*h_OptionYears;
//'d_' prefix - GPU (device) memory space
float
//Results calculated by GPU
*d_CallResult,
*d_PutResult,
//GPU instance of input data
*d_StockPrice,
*d_OptionStrike,
*d_OptionYears;
double
delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;
StopWatchInterface *hTimer = NULL;
int i;
findCudaDevice(argc, (const char **)argv);
sdkCreateTimer(&hTimer);
printf("Initializing data...\n");
printf("...allocating CPU memory for options.\n");
h_CallResultCPU = (float *)malloc(OPT_SZ);
h_PutResultCPU = (float *)malloc(OPT_SZ);
h_CallResultGPU = (float *)malloc(OPT_SZ);
h_PutResultGPU = (float *)malloc(OPT_SZ);
h_StockPrice = (float *)malloc(OPT_SZ);
h_OptionStrike = (float *)malloc(OPT_SZ);
h_OptionYears = (float *)malloc(OPT_SZ);
printf("...allocating GPU memory for options.\n");
checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ));
checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ));
printf("...generating input data in CPU mem.\n");
srand(5347);
//Generate options set
for (i = 0; i < OPT_N; i++)
{
h_CallResultCPU[i] = 0.0f;
h_PutResultCPU[i] = -1.0f;
h_StockPrice[i] = RandFloat(5.0f, 30.0f);
h_OptionStrike[i] = RandFloat(1.0f, 100.0f);
h_OptionYears[i] = RandFloat(0.25f, 10.0f);
}
printf("...copying input data to GPU mem.\n");
//Copy options data to GPU memory for further processing
checkCudaErrors(cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice));
printf("Data init done.\n\n");
printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS);
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
for (i = 0; i < NUM_ITERATIONS; i++)
{
BlackScholesGPU<<<DIV_UP((OPT_N/2), 128), 128>>>(
(float2 *)d_CallResult,
(float2 *)d_PutResult,
(float2 *)d_StockPrice,
(float2 *)d_OptionStrike,
(float2 *)d_OptionYears,
RISKFREE,
VOLATILITY,
OPT_N
);
getLastCudaError("BlackScholesGPU() execution failed\n");
}
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
//Both call and put are calculated
printf("Options count : %i \n", 2 * OPT_N);
printf("BlackScholesGPU() time : %f msec\n", gpuTime);
printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n",
(((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128);
printf("\nReading back GPU results...\n");
//Read back GPU results to compare them to CPU results
checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost));
printf("Checking the results...\n");
printf("...running CPU calculations.\n\n");
//Calculate options values on CPU
BlackScholesCPU(
h_CallResultCPU,
h_PutResultCPU,
h_StockPrice,
h_OptionStrike,
h_OptionYears,
RISKFREE,
VOLATILITY,
OPT_N
);
printf("Comparing the results...\n");
//Calculate max absolute difference and L1 distance
//between CPU and GPU results
sum_delta = 0;
sum_ref = 0;
max_delta = 0;
for (i = 0; i < OPT_N; i++)
{
ref = h_CallResultCPU[i];
delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);
if (delta > max_delta)
{
max_delta = delta;
}
sum_delta += delta;
sum_ref += fabs(ref);
}
L1norm = sum_delta / sum_ref;
printf("L1 norm: %E\n", L1norm);
printf("Max absolute error: %E\n\n", max_delta);
printf("Shutting down...\n");
printf("...releasing GPU memory.\n");
checkCudaErrors(cudaFree(d_OptionYears));
checkCudaErrors(cudaFree(d_OptionStrike));
checkCudaErrors(cudaFree(d_StockPrice));
checkCudaErrors(cudaFree(d_PutResult));
checkCudaErrors(cudaFree(d_CallResult));
printf("...releasing CPU memory.\n");
free(h_OptionYears);
free(h_OptionStrike);
free(h_StockPrice);
free(h_PutResultGPU);
free(h_CallResultGPU);
free(h_PutResultCPU);
free(h_CallResultCPU);
sdkDeleteTimer(&hTimer);
printf("Shutdown done.\n");
printf("\n[BlackScholes] - Test Summary\n");
if (L1norm > 1e-6)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n");
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
///////////////////////////////////////////////////////////////////////////////
// Polynomial approximation of cumulative normal distribution function
///////////////////////////////////////////////////////////////////////////////
__device__ inline float cndGPU(float d)
{
const float A1 = 0.31938153f;
const float A2 = -0.356563782f;
const float A3 = 1.781477937f;
const float A4 = -1.821255978f;
const float A5 = 1.330274429f;
const float RSQRT2PI = 0.39894228040143267793994605993438f;
float
K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
float
cnd = RSQRT2PI * __expf(- 0.5f * d * d) *
(K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
if (d > 0)
cnd = 1.0f - cnd;
return cnd;
}
///////////////////////////////////////////////////////////////////////////////
// Black-Scholes formula for both call and put
///////////////////////////////////////////////////////////////////////////////
__device__ inline void BlackScholesBodyGPU(
float &CallResult,
float &PutResult,
float S, //Stock price
float X, //Option strike
float T, //Option years
float R, //Riskless rate
float V //Volatility rate
)
{
float sqrtT, expRT;
float d1, d2, CNDD1, CNDD2;
sqrtT = __fdividef(1.0F, rsqrtf(T));
d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
d2 = d1 - V * sqrtT;
CNDD1 = cndGPU(d1);
CNDD2 = cndGPU(d2);
//Calculate Call and Put simultaneously
expRT = __expf(- R * T);
CallResult = S * CNDD1 - X * expRT * CNDD2;
PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
}
////////////////////////////////////////////////////////////////////////////////
//Process an array of optN options on GPU
////////////////////////////////////////////////////////////////////////////////
__launch_bounds__(128)
__global__ void BlackScholesGPU(
float2 * __restrict d_CallResult,
float2 * __restrict d_PutResult,
float2 * __restrict d_StockPrice,
float2 * __restrict d_OptionStrike,
float2 * __restrict d_OptionYears,
float Riskfree,
float Volatility,
int optN
)
{
////Thread index
const int opt = blockDim.x * blockIdx.x + threadIdx.x;
// Calculating 2 options per thread to increase ILP (instruction level parallelism)
if (opt < (optN / 2))
{
float callResult1, callResult2;
float putResult1, putResult2;
BlackScholesBodyGPU(
callResult1,
putResult1,
d_StockPrice[opt].x,
d_OptionStrike[opt].x,
d_OptionYears[opt].x,
Riskfree,
Volatility
);
BlackScholesBodyGPU(
callResult2,
putResult2,
d_StockPrice[opt].y,
d_OptionStrike[opt].y,
d_OptionYears[opt].y,
Riskfree,
Volatility
);
d_CallResult[opt] = make_float2(callResult1, callResult2);
d_PutResult[opt] = make_float2(putResult1, putResult2);
}
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "common_gpu_header.h"
#include "binomialOptions_common.h"
#include "realtype.h"
//Preprocessed input option data
typedef struct
{
real S;
real X;
real vDt;
real puByDf;
real pdByDf;
} __TOptionData;
static __constant__ __TOptionData d_OptionData[MAX_OPTIONS];
__device__ real d_CallValue[MAX_OPTIONS];
#define THREADBLOCK_SIZE 128
#define ELEMS_PER_THREAD (NUM_STEPS/THREADBLOCK_SIZE)
#if NUM_STEPS % THREADBLOCK_SIZE
#error Bad constants
#endif
////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
#ifndef DOUBLE_PRECISION
__device__ inline float expiryCallValue(float S, float X, float vDt, int i)
{
float d = S * __expf(vDt * (2.0f * i - NUM_STEPS)) - X;
return (d > 0.0F) ? d : 0.0F;
}
#else
__device__ inline double expiryCallValue(double S, double X, double vDt, int i)
{
double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X;
return (d > 0.0) ? d : 0.0;
}
#endif
////////////////////////////////////////////////////////////////////////////////
// GPU kernel
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void binomialOptionsKernel()
{
__shared__ real call_exchange[THREADBLOCK_SIZE + 1];
const int tid = threadIdx.x;
const real S = d_OptionData[blockIdx.x].S;
const real X = d_OptionData[blockIdx.x].X;
const real vDt = d_OptionData[blockIdx.x].vDt;
const real puByDf = d_OptionData[blockIdx.x].puByDf;
const real pdByDf = d_OptionData[blockIdx.x].pdByDf;
real call[ELEMS_PER_THREAD + 1];
#pragma unroll
for (int i = 0; i < ELEMS_PER_THREAD; ++i)
call[i] = expiryCallValue(S, X, vDt, tid * ELEMS_PER_THREAD + i);
if (tid == 0)
call_exchange[THREADBLOCK_SIZE] = expiryCallValue(S, X, vDt, NUM_STEPS);
int final_it = max(0, tid * ELEMS_PER_THREAD - 1);
#pragma unroll 16
for (int i = NUM_STEPS; i > 0; --i)
{
call_exchange[tid] = call[0];
__syncthreads();
call[ELEMS_PER_THREAD] = call_exchange[tid + 1];
__syncthreads();
if (i > final_it)
{
#pragma unroll
for(int j = 0; j < ELEMS_PER_THREAD; ++j)
call[j] = puByDf * call[j + 1] + pdByDf * call[j];
}
}
if (tid == 0)
{
d_CallValue[blockIdx.x] = call[0];
}
}
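/*
 * Illustrative host-side sketch (hypothetical helper, not the sample's CPU
 * reference) of the backward induction performed by binomialOptionsKernel()
 * above: fill the expiry layer with max(S * exp(vDt * (2*i - steps)) - X, 0),
 * then repeatedly combine adjacent nodes with the discounted pseudo-
 * probabilities puByDf and pdByDf until only the root value remains.
 */
#include <vector>
#include <cmath>
#include <algorithm>
static double binomialCallHostSketch(double S, double X, double vDt,
                                     double puByDf, double pdByDf, int steps)
{
    std::vector<double> call(steps + 1);
    for (int i = 0; i <= steps; ++i)
    {
        call[i] = std::max(S * std::exp(vDt * (2.0 * i - steps)) - X, 0.0);
    }
    // Walk the tree back to the root; after 'steps' passes call[0] is the price
    for (int i = steps; i > 0; --i)
    {
        for (int j = 0; j < i; ++j)
        {
            call[j] = puByDf * call[j + 1] + pdByDf * call[j];
        }
    }
    return call[0];
}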
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
////////////////////////////////////////////////////////////////////////////////
// Global types
////////////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <curand_kernel.h>
#include "MonteCarlo_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper reduction template
// Please see the "reduction" CUDA Sample for more information
////////////////////////////////////////////////////////////////////////////////
#include "MonteCarlo_reduction.cuh"
////////////////////////////////////////////////////////////////////////////////
// Internal GPU-side data structures
////////////////////////////////////////////////////////////////////////////////
#define MAX_OPTIONS (1024*1024)
//Preprocessed input option data
typedef struct
{
real S;
real X;
real MuByT;
real VBySqrtT;
} __TOptionData;
////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut payoff functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
__device__ inline float endCallValue(float S, float X, float r, float MuByT, float VBySqrtT)
{
float callValue = S * __expf(MuByT + VBySqrtT * r) - X;
return (callValue > 0.0F) ? callValue : 0.0F;
}
__device__ inline double endCallValue(double S, double X, double r, double MuByT, double VBySqrtT)
{
double callValue = S * exp(MuByT + VBySqrtT * r) - X;
return (callValue > 0.0) ? callValue : 0.0;
}
#define THREAD_N 256
////////////////////////////////////////////////////////////////////////////////
// This kernel computes the integral over all paths using a single thread block
// per option. It is fastest when the number of thread blocks times the work per
// block is high enough to keep the GPU busy.
////////////////////////////////////////////////////////////////////////////////
static __global__ void MonteCarloOneBlockPerOption(
curandState * __restrict rngStates,
const __TOptionData * __restrict d_OptionData,
__TOptionValue * __restrict d_CallValue,
int pathN,
int optionN)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
const int SUM_N = THREAD_N;
__shared__ real s_SumCall[SUM_N];
__shared__ real s_Sum2Call[SUM_N];
// determine global thread id
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Copy random number state to local memory for efficiency
curandState localState = rngStates[tid];
for(int optionIndex = blockIdx.x; optionIndex < optionN; optionIndex += gridDim.x)
{
const real S = d_OptionData[optionIndex].S;
const real X = d_OptionData[optionIndex].X;
const real MuByT = d_OptionData[optionIndex].MuByT;
const real VBySqrtT = d_OptionData[optionIndex].VBySqrtT;
//Cycle through the entire samples array:
//derive end stock price for each path
//accumulate partial integrals into intermediate shared memory buffer
for (int iSum = threadIdx.x; iSum < SUM_N; iSum += blockDim.x)
{
__TOptionValue sumCall = {0, 0};
#pragma unroll 8
for (int i = iSum; i < pathN; i += SUM_N)
{
real r = curand_normal(&localState);
real callValue = endCallValue(S, X, r, MuByT, VBySqrtT);
sumCall.Expected += callValue;
sumCall.Confidence += callValue * callValue;
}
s_SumCall[iSum] = sumCall.Expected;
s_Sum2Call[iSum] = sumCall.Confidence;
}
//Reduce shared memory accumulators
//and write final result to global memory
cg::sync(cta);
sumReduce<real, SUM_N, THREAD_N>(s_SumCall, s_Sum2Call, cta, tile32, &d_CallValue[optionIndex]);
}
}
static __global__ void rngSetupStates(
curandState *rngState,
int device_id)
{
// determine global thread id
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Each threadblock gets different seed,
// Threads within a threadblock get different sequence numbers
curand_init(blockIdx.x + gridDim.x * device_id, threadIdx.x, 0, &rngState[tid]);
}
////////////////////////////////////////////////////////////////////////////////
// Host-side interface to GPU Monte Carlo
////////////////////////////////////////////////////////////////////////////////
extern "C" void initMonteCarloGPU(TOptionPlan *plan)
{
checkCudaErrors(cudaMalloc(&plan->d_OptionData, sizeof(__TOptionData)*(plan->optionCount)));
checkCudaErrors(cudaMalloc(&plan->d_CallValue, sizeof(__TOptionValue)*(plan->optionCount)));
checkCudaErrors(cudaMallocHost(&plan->h_OptionData, sizeof(__TOptionData)*(plan->optionCount)));
//Allocate internal device memory
checkCudaErrors(cudaMallocHost(&plan->h_CallValue, sizeof(__TOptionValue)*(plan->optionCount)));
//Allocate states for pseudo random number generators
checkCudaErrors(cudaMalloc((void **) &plan->rngStates,
plan->gridSize * THREAD_N * sizeof(curandState)));
checkCudaErrors(cudaMemset(plan->rngStates, 0, plan->gridSize * THREAD_N * sizeof(curandState)));
// Give each block on each device a distinct seed; threads within a block use different sequence numbers
rngSetupStates<<<plan->gridSize, THREAD_N>>>(plan->rngStates, plan->device);
getLastCudaError("rngSetupStates kernel failed.\n");
}
//Compute statistics and deallocate internal device memory
extern "C" void closeMonteCarloGPU(TOptionPlan *plan)
{
for (int i = 0; i < plan->optionCount; i++)
{
const double RT = plan->optionData[i].R * plan->optionData[i].T;
const double sum = plan->h_CallValue[i].Expected;
const double sum2 = plan->h_CallValue[i].Confidence;
const double pathN = plan->pathN;
//Derive average from the total sum and discount by riskfree rate
plan->callValue[i].Expected = (float)(exp(-RT) * sum / pathN);
//Standard deviation
double stdDev = sqrt((pathN * sum2 - sum * sum)/ (pathN * (pathN - 1)));
//Confidence width; in 95% of all cases theoretical value lies within these borders
plan->callValue[i].Confidence = (float)(exp(-RT) * 1.96 * stdDev / sqrt(pathN));
}
checkCudaErrors(cudaFree(plan->rngStates));
checkCudaErrors(cudaFreeHost(plan->h_CallValue));
checkCudaErrors(cudaFreeHost(plan->h_OptionData));
checkCudaErrors(cudaFree(plan->d_CallValue));
checkCudaErrors(cudaFree(plan->d_OptionData));
}
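/*
 * Illustrative host-side sketch (hypothetical helper, not from the CUDA
 * samples) of the statistics derived in closeMonteCarloGPU() above from the
 * per-option running sums: the discounted mean payoff, the sample standard
 * deviation, and the 95% confidence half-width.
 */
#include <cmath>
static void monteCarloStatsSketch(double sum, double sum2, double pathN, double RT,
                                  double &expected, double &confidence)
{
    expected = std::exp(-RT) * sum / pathN; // discounted mean payoff
    // Sample standard deviation from the sum and the sum of squares
    const double stdDev = std::sqrt((pathN * sum2 - sum * sum) / (pathN * (pathN - 1)));
    // 95% of estimates fall within +/- 1.96 standard errors of the mean
    confidence = std::exp(-RT) * 1.96 * stdDev / std::sqrt(pathN);
}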
//Main computations
extern "C" void MonteCarloGPU(TOptionPlan *plan, cudaStream_t stream)
{
__TOptionValue *h_CallValue = plan->h_CallValue;
if (plan->optionCount <= 0 || plan->optionCount > MAX_OPTIONS)
{
printf("MonteCarloGPU(): bad option count.\n");
return;
}
__TOptionData * h_OptionData = (__TOptionData *)plan->h_OptionData;
for (int i = 0; i < plan->optionCount; i++)
{
const double T = plan->optionData[i].T;
const double R = plan->optionData[i].R;
const double V = plan->optionData[i].V;
const double MuByT = (R - 0.5 * V * V) * T;
const double VBySqrtT = V * sqrt(T);
h_OptionData[i].S = (real)plan->optionData[i].S;
h_OptionData[i].X = (real)plan->optionData[i].X;
h_OptionData[i].MuByT = (real)MuByT;
h_OptionData[i].VBySqrtT = (real)VBySqrtT;
}
checkCudaErrors(cudaMemcpyAsync(
plan->d_OptionData,
h_OptionData,
plan->optionCount * sizeof(__TOptionData),
cudaMemcpyHostToDevice, stream
));
MonteCarloOneBlockPerOption<<<plan->gridSize, THREAD_N, 0, stream>>>(
plan->rngStates,
(__TOptionData *)(plan->d_OptionData),
(__TOptionValue *)(plan->d_CallValue),
plan->pathN,
plan->optionCount
);
getLastCudaError("MonteCarloOneBlockPerOption() execution failed\n");
checkCudaErrors(cudaMemcpyAsync(
h_CallValue,
plan->d_CallValue,
plan->optionCount * sizeof(__TOptionValue), cudaMemcpyDeviceToHost, stream
));
//cudaDeviceSynchronize();
}
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef MONTECARLO_REDUCTION_CUH
#define MONTECARLO_REDUCTION_CUH
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
////////////////////////////////////////////////////////////////////////////////
// This function calculates total sum for each of the two input arrays.
// SUM_N must be a power of two
// Unrolling provides a bit of a performance improvement for small
// to medium path counts.
////////////////////////////////////////////////////////////////////////////////
template<class T, int SUM_N, int blockSize>
__device__ void sumReduce(T *sum, T *sum2, cg::thread_block &cta, cg::thread_block_tile<32> &tile32, __TOptionValue *d_CallValue)
{
const int VEC = 32;
const int tid = cta.thread_rank();
T beta = sum[tid];
T beta2 = sum2[tid];
T temp, temp2;
for (int i = VEC/2; i > 0; i>>=1)
{
if (tile32.thread_rank() < i)
{
temp = sum[tid+i];
temp2 = sum2[tid+i];
beta += temp;
beta2 += temp2;
sum[tid] = beta;
sum2[tid] = beta2;
}
cg::sync(tile32);
}
cg::sync(cta);
if (tid == 0)
{
beta = 0;
beta2 = 0;
for (int i = 0; i < blockDim.x; i += VEC)
{
beta += sum[i];
beta2 += sum2[i];
}
__TOptionValue t = {beta, beta2};
*d_CallValue = t;
}
cg::sync(cta);
}
#endif
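/*
 * Illustrative alternative sketch (hypothetical helper, not used by the
 * sample) of the 32-wide partial sum performed above with shared memory and
 * tile32: the same butterfly can be written with warp shuffles, avoiding the
 * shared-memory traffic for the first stage. Lane 0 ends up holding the
 * warp-wide sum.
 */
__inline__ __device__ float warpReduceSumSketch(float val)
{
    // Each step folds the upper half of the warp onto the lower half
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}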
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef QUASIRANDOMGENERATOR_KERNEL_CUH
#define QUASIRANDOMGENERATOR_KERNEL_CUH
#include <stdio.h>
#include <stdlib.h>
#include <helper_cuda.h>
#include "quasirandomGenerator_common.h"
//Fast integer multiplication
#define MUL(a, b) __umul24(a, b)
////////////////////////////////////////////////////////////////////////////////
// Niederreiter quasirandom number generation kernel
////////////////////////////////////////////////////////////////////////////////
static __constant__ unsigned int c_Table[QRNG_DIMENSIONS][QRNG_RESOLUTION];
static __global__ void quasirandomGeneratorKernel(
float *d_Output,
unsigned int seed,
unsigned int N
)
{
unsigned int *dimBase = &c_Table[threadIdx.y][0];
unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x;
unsigned int threadN = MUL(blockDim.x, gridDim.x);
for (unsigned int pos = tid; pos < N; pos += threadN)
{
unsigned int result = 0;
unsigned int data = seed + pos;
for (int bit = 0; bit < QRNG_RESOLUTION; bit++, data >>= 1)
if (data & 1)
{
result ^= dimBase[bit];
}
d_Output[MUL(threadIdx.y, N) + pos] = (float)(result + 1) * INT_SCALE;
}
}
//Table initialization routine
extern "C" void initTableGPU(unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION])
{
checkCudaErrors(cudaMemcpyToSymbol(
c_Table,
tableCPU,
QRNG_DIMENSIONS * QRNG_RESOLUTION * sizeof(unsigned int)
));
}
//Host-side interface
extern "C" void quasirandomGeneratorGPU(float *d_Output, unsigned int seed, unsigned int N)
{
dim3 threads(128, QRNG_DIMENSIONS);
quasirandomGeneratorKernel<<<128, threads>>>(d_Output, seed, N);
getLastCudaError("quasirandomGeneratorKernel() execution failed.\n");
}
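////////////////////////////////////////////////////////////////////////////////
// Illustrative host-side reference (not part of the original sample): computes
// the same Niederreiter value as the kernel for a single (dimension, position)
// pair, so kernel output can be spot-checked. "tableCPU" is assumed to hold the
// same direction table that initTableGPU() uploads to c_Table; the function
// name is hypothetical.
////////////////////////////////////////////////////////////////////////////////
static float exampleNiederreiterHost(
    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION],
    int dim,
    unsigned int seed,
    unsigned int pos
)
{
    unsigned int result = 0;
    unsigned int data = seed + pos;
    // XOR in the direction number for every set bit of (seed + pos),
    // exactly as quasirandomGeneratorKernel does on the device
    for (int bit = 0; bit < QRNG_RESOLUTION; bit++, data >>= 1)
        if (data & 1)
        {
            result ^= tableCPU[dim][bit];
        }
    // same scaling as the kernel: map the integer result into (0, 1]
    return (float)(result + 1) * INT_SCALE;
}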
////////////////////////////////////////////////////////////////////////////////
// Moro's Inverse Cumulative Normal Distribution function approximation
////////////////////////////////////////////////////////////////////////////////
__device__ inline float MoroInvCNDgpu(unsigned int x)
{
const float a1 = 2.50662823884f;
const float a2 = -18.61500062529f;
const float a3 = 41.39119773534f;
const float a4 = -25.44106049637f;
const float b1 = -8.4735109309f;
const float b2 = 23.08336743743f;
const float b3 = -21.06224101826f;
const float b4 = 3.13082909833f;
const float c1 = 0.337475482272615f;
const float c2 = 0.976169019091719f;
const float c3 = 0.160797971491821f;
const float c4 = 2.76438810333863E-02f;
const float c5 = 3.8405729373609E-03f;
const float c6 = 3.951896511919E-04f;
const float c7 = 3.21767881768E-05f;
const float c8 = 2.888167364E-07f;
const float c9 = 3.960315187E-07f;
float z;
bool negate = false;
// Ensure the conversion to floating point will give a value in the
// range (0,0.5] by restricting the input to the bottom half of the
// input domain. We will later reflect the result if the input was
// originally in the top half of the input domain
if (x >= 0x80000000UL)
{
x = 0xffffffffUL - x;
negate = true;
}
// x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff])
// Convert to floating point in (0,0.5]
const float x1 = 1.0f / static_cast<float>(0xffffffffUL);
const float x2 = x1 / 2.0f;
float p1 = x * x1 + x2;
// Convert to floating point in (-0.5,0]
float p2 = p1 - 0.5f;
// The input to the Moro inversion is p2 which is in the range
// (-0.5,0]. This means that our output will be the negative side
// of the bell curve (which we will reflect if "negate" is true).
// Main body of the bell curve for |p| < 0.42
if (p2 > -0.42f)
{
z = p2 * p2;
z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0f);
}
// Special case (Chebychev) for tail
else
{
z = __logf(-__logf(p1));
z = - (c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * (c7 + z * (c8 + z * c9))))))));
}
// If the original input (x) was in the top half of the range, reflect
// to get the positive side of the bell curve
return negate ? -z : z;
}
////////////////////////////////////////////////////////////////////////////////
// Main kernel. Choose between transforming
// input sequence and uniform ascending (0, 1) sequence
////////////////////////////////////////////////////////////////////////////////
static __global__ void inverseCNDKernel(
float *d_Output,
unsigned int *d_Input,
unsigned int pathN
)
{
unsigned int distance = ((unsigned int)-1) / (pathN + 1);
unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x;
unsigned int threadN = MUL(blockDim.x, gridDim.x);
//Transform input number sequence if it's supplied
if (d_Input)
{
for (unsigned int pos = tid; pos < pathN; pos += threadN)
{
unsigned int d = d_Input[pos];
d_Output[pos] = (float)MoroInvCNDgpu(d);
}
}
//Else generate input uniformly placed samples on the fly
//and write to destination
else
{
for (unsigned int pos = tid; pos < pathN; pos += threadN)
{
unsigned int d = (pos + 1) * distance;
d_Output[pos] = (float)MoroInvCNDgpu(d);
}
}
}
extern "C" void inverseCNDgpu(float *d_Output, unsigned int *d_Input, unsigned int N)
{
inverseCNDKernel<<<128, 128>>>(d_Output, d_Input, N);
getLastCudaError("inverseCNDKernel() execution failed.\n");
}
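////////////////////////////////////////////////////////////////////////////////
// Illustrative host sketch (not part of the original sample): when d_Input is
// NULL, inverseCNDKernel places its own uniform samples; sample "pos" lands at
// roughly (pos + 1) / (pathN + 1) on (0, 1) before the Moro inversion. The
// function name is hypothetical.
////////////////////////////////////////////////////////////////////////////////
static float exampleUniformPlacement(unsigned int pos, unsigned int pathN)
{
    // same integer spacing the kernel uses
    unsigned int distance = ((unsigned int)-1) / (pathN + 1);
    unsigned int d = (pos + 1) * distance;
    // scale by 1/2^32 to land in (0, 1)
    return (float)d * (1.0f / 4294967296.0f);
}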
#endif
/*
* Portions Copyright (c) 1993-2015 NVIDIA Corporation. All rights reserved.
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
* Portions Copyright (c) 2009 Mike Giles, Oxford University. All rights reserved.
* Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe. All rights reserved.
*
* Sobol Quasi-random Number Generator example
*
* Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
* http://people.maths.ox.ac.uk/~gilesm/
*
* and C code developed by Stephen Joe, University of Waikato, New Zealand
* and Frances Kuo, University of New South Wales, Australia
* http://web.maths.unsw.edu.au/~fkuo/sobol/
*
* For theoretical background see:
*
* P. Bratley and B.L. Fox.
* Implementing Sobol's quasirandom sequence generator
* http://portal.acm.org/citation.cfm?id=42288
* ACM Trans. on Math. Software, 14(1):88-100, 1988
*
* S. Joe and F. Kuo.
* Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
* http://portal.acm.org/citation.cfm?id=641879
* ACM Trans. on Math. Software, 29(1):49-57, 2003
*
*/
#include "sobol.h"
#include "sobol_gpu.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#define k_2powneg32 2.3283064E-10F
__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ unsigned int v[n_directions];
// Offset into the correct dimension as specified by the
// block y coordinate
d_directions = d_directions + n_directions * blockIdx.y;
d_output = d_output + n_vectors * blockIdx.y;
// Copy the direction numbers for this dimension into shared
// memory - there are only 32 direction numbers so only the
// first 32 (n_directions) threads need to participate.
if (threadIdx.x < n_directions)
{
v[threadIdx.x] = d_directions[threadIdx.x];
}
cg::sync(cta);
// Set initial index (i.e. which vector this thread is
// computing first) and stride (i.e. step to the next vector
// for this thread)
int i0 = threadIdx.x + blockIdx.x * blockDim.x;
int stride = gridDim.x * blockDim.x;
// Get the gray code of the index
// c.f. Numerical Recipes in C, chapter 20
// http://www.nrbook.com/a/bookcpdf/c20-2.pdf
unsigned int g = i0 ^ (i0 >> 1);
// Initialisation for first point x[i0]
// In the Bratley and Fox paper this is equation (*), where
// we are computing the value for x[n] without knowing the
// value of x[n-1].
unsigned int X = 0;
unsigned int mask;
for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
{
// We want X ^= g_k * v[k], where g_k is one or zero.
// We do this by setting a mask with all bits equal to
// g_k. In reality we keep shifting g so that g_k is the
// LSB of g. This way we avoid multiplication.
mask = - (g & 1);
X ^= mask & v[k];
g = g >> 1;
}
if (i0 < n_vectors)
{
d_output[i0] = (float)X * k_2powneg32;
}
// Now do rest of points, using the stride
// Here we want to generate x[i] from x[i-stride] where we
// don't have any of the x in between, therefore we have to
// revisit the equation (**), this is easiest with an example
// so assume stride is 16.
// From x[n] to x[n+16] there will be:
// 8 changes in the first bit
// 4 changes in the second bit
// 2 changes in the third bit
// 1 change in the fourth
// 1 change in one of the remaining bits
//
// What this means is that in the equation:
// x[n+1] = x[n] ^ v[p]
// x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
// ...
// We will apply xor with v[1] eight times, v[2] four times,
// v[3] twice, v[4] once and one other direction number once.
// Since two xors cancel out, we can skip even applications
// and just apply xor with v[4] (i.e. log2(16)) and with
// the current applicable direction number.
// Note that all these indices count from 1, so we need to
// subtract 1 from them all to account for C arrays counting
// from zero.
unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
unsigned int v_stridemask = stride - 1;
for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
{
// x[i] = x[i-stride] ^ v[b] ^ v[c]
// where b is log2(stride) minus 1 for C array indexing
// where c is the index of the rightmost zero bit in i,
// not including the bottom log2(stride) bits, minus 1
// for C array indexing
// In the Bratley and Fox paper this is equation (**)
X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
d_output[i] = (float)X * k_2powneg32;
}
}
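////////////////////////////////////////////////////////////////////////////////
// Illustrative host-side sketch (not part of the original sample): the plain
// sequential Sobol' recurrence that the kernel above strides over, i.e. the
// stride == 1 case of equation (**) discussed in the comments. "directions" is
// assumed to hold the 32 direction numbers for one dimension (one row of
// d_directions); the function name is hypothetical.
////////////////////////////////////////////////////////////////////////////////
static void exampleSobolHost(unsigned int n_vectors, const unsigned int *directions, float *output)
{
    unsigned int X = 0;
    if (n_vectors > 0)
    {
        output[0] = 0.0f;  // x[0] = 0 by construction
    }
    for (unsigned int i = 1; i < n_vectors; i++)
    {
        // find the index of the rightmost zero bit of (i - 1);
        // this is the "c" of equation (**) with the stride terms dropped
        unsigned int value = i - 1;
        unsigned int c = 0;
        while (value & 1)
        {
            value >>= 1;
            c++;
        }
        X ^= directions[c];
        output[i] = (float)X * k_2powneg32;
    }
}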
extern "C"
void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
{
const int threadsperblock = 64;
// Set up the execution configuration
dim3 dimGrid;
dim3 dimBlock;
int device;
cudaDeviceProp prop;
checkCudaErrors(cudaGetDevice(&device));
checkCudaErrors(cudaGetDeviceProperties(&prop, device));
// This implementation of the generator outputs all the draws for
// one dimension in a contiguous region of memory, followed by the
// next dimension and so on.
// Therefore all threads within a block will be processing different
// vectors from the same dimension. As a result we want the total
// number of blocks to be a multiple of the number of dimensions.
dimGrid.y = n_dimensions;
// If the number of dimensions is large then we will set the number
// of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
// but if the number of dimensions is small (e.g. less than four per
// multiprocessor) then we'll partition the vectors across blocks
// (as well as threads).
if (n_dimensions < (4 * prop.multiProcessorCount))
{
dimGrid.x = 4 * prop.multiProcessorCount;
}
else
{
dimGrid.x = 1;
}
// Cap the dimGrid.x if the number of vectors is small
if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock))
{
dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
}
// Round up to a power of two, required for the algorithm so that
// stride is a power of two.
unsigned int targetDimGridX = dimGrid.x;
for (dimGrid.x = 1 ; dimGrid.x < targetDimGridX ; dimGrid.x *= 2);
// Fix the number of threads
dimBlock.x = threadsperblock;
// Execute GPU kernel
sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
////////////////////////////////////////////////////////////////////////////////
// Global types and parameters
////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "binomialOptions_common.h"
#include "realtype.h"
//Preprocessed input option data
typedef struct
{
real S;
real X;
real vDt;
real puByDf;
real pdByDf;
} __TOptionData;
static __constant__ __TOptionData d_OptionData[MAX_OPTIONS];
static __device__ real d_CallValue[MAX_OPTIONS];
////////////////////////////////////////////////////////////////////////////////
// Overloaded shortcut functions for different precision modes
////////////////////////////////////////////////////////////////////////////////
#ifndef DOUBLE_PRECISION
__device__ inline float expiryCallValue(float S, float X, float vDt, int i)
{
float d = S * __expf(vDt * (2.0f * i - NUM_STEPS)) - X;
return (d > 0.0F) ? d : 0.0F;
}
#else
__device__ inline double expiryCallValue(double S, double X, double vDt, int i)
{
double d = S * exp(vDt * (2.0 * i - NUM_STEPS)) - X;
return (d > 0.0) ? d : 0.0;
}
#endif
////////////////////////////////////////////////////////////////////////////////
// GPU kernel
////////////////////////////////////////////////////////////////////////////////
#define THREADBLOCK_SIZE 128
#define ELEMS_PER_THREAD (NUM_STEPS/THREADBLOCK_SIZE)
#if NUM_STEPS % THREADBLOCK_SIZE
#error Bad constants
#endif
__global__ void binomialOptionsKernel()
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ real call_exchange[THREADBLOCK_SIZE + 1];
const int tid = threadIdx.x;
const real S = d_OptionData[blockIdx.x].S;
const real X = d_OptionData[blockIdx.x].X;
const real vDt = d_OptionData[blockIdx.x].vDt;
const real puByDf = d_OptionData[blockIdx.x].puByDf;
const real pdByDf = d_OptionData[blockIdx.x].pdByDf;
real call[ELEMS_PER_THREAD + 1];
#pragma unroll
for(int i = 0; i < ELEMS_PER_THREAD; ++i)
call[i] = expiryCallValue(S, X, vDt, tid * ELEMS_PER_THREAD + i);
if (tid == 0)
call_exchange[THREADBLOCK_SIZE] = expiryCallValue(S, X, vDt, NUM_STEPS);
int final_it = max(0, tid * ELEMS_PER_THREAD - 1);
#pragma unroll 16
for(int i = NUM_STEPS; i > 0; --i)
{
call_exchange[tid] = call[0];
cg::sync(cta);
call[ELEMS_PER_THREAD] = call_exchange[tid + 1];
cg::sync(cta);
if (i > final_it)
{
#pragma unroll
for(int j = 0; j < ELEMS_PER_THREAD; ++j)
call[j] = puByDf * call[j + 1] + pdByDf * call[j];
}
}
if (tid == 0)
{
d_CallValue[blockIdx.x] = call[0];
}
}
////////////////////////////////////////////////////////////////////////////////
// Host-side interface to GPU binomialOptions
////////////////////////////////////////////////////////////////////////////////
extern "C" void binomialOptionsGPU(
real *callValue,
TOptionData *optionData,
int optN
)
{
__TOptionData h_OptionData[MAX_OPTIONS];
for (int i = 0; i < optN; i++)
{
const real T = optionData[i].T;
const real R = optionData[i].R;
const real V = optionData[i].V;
const real dt = T / (real)NUM_STEPS;
const real vDt = V * sqrt(dt);
const real rDt = R * dt;
//Per-step interest and discount factors
const real If = exp(rDt);
const real Df = exp(-rDt);
//Values and pseudoprobabilities of upward and downward moves
const real u = exp(vDt);
const real d = exp(-vDt);
const real pu = (If - d) / (u - d);
const real pd = (real)1.0 - pu;
const real puByDf = pu * Df;
const real pdByDf = pd * Df;
h_OptionData[i].S = (real)optionData[i].S;
h_OptionData[i].X = (real)optionData[i].X;
h_OptionData[i].vDt = (real)vDt;
h_OptionData[i].puByDf = (real)puByDf;
h_OptionData[i].pdByDf = (real)pdByDf;
}
checkCudaErrors(cudaMemcpyToSymbol(d_OptionData, h_OptionData, optN * sizeof(__TOptionData)));
binomialOptionsKernel<<<optN, THREADBLOCK_SIZE>>>();
getLastCudaError("binomialOptionsKernel() execution failed.\n");
checkCudaErrors(cudaMemcpyFromSymbol(callValue, d_CallValue, optN *sizeof(real)));
}
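////////////////////////////////////////////////////////////////////////////////
// Illustrative host sketch (not part of the original sample): the
// single-option, single-threaded backward induction that binomialOptionsKernel
// parallelizes, using the same precomputed vDt, puByDf and pdByDf as above.
// The function name is hypothetical; "steps" would be NUM_STEPS in the sample.
////////////////////////////////////////////////////////////////////////////////
static real exampleBinomialHost(real S, real X, real vDt, real puByDf, real pdByDf, int steps)
{
    real *call = (real *)malloc((steps + 1) * sizeof(real));
    // option values at expiry: max(S * exp(vDt * (2*i - steps)) - X, 0)
    for (int i = 0; i <= steps; i++)
    {
        real d = S * exp(vDt * (real)(2 * i - steps)) - X;
        call[i] = (d > (real)0) ? d : (real)0;
    }
    // walk the tree back to t = 0, discounting one level per iteration
    for (int t = steps; t > 0; t--)
    {
        for (int i = 0; i < t; i++)
        {
            call[i] = puByDf * call[i + 1] + pdByDf * call[i];
        }
    }
    real result = call[0];
    free(call);
    return result;
}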
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef QUASIRANDOMGENERATOR_KERNEL_CUH
#define QUASIRANDOMGENERATOR_KERNEL_CUH
#include "quasirandomGenerator_common.h"
//Fast integer multiplication
#define MUL(a, b) __umul24(a, b)
////////////////////////////////////////////////////////////////////////////////
// Niederreiter quasirandom number generation kernel
////////////////////////////////////////////////////////////////////////////////
__constant__ unsigned int c_Table[QRNG_DIMENSIONS][QRNG_RESOLUTION];
extern "C" __global__ void quasirandomGeneratorKernel(
float *d_Output,
unsigned int seed,
unsigned int N
)
{
unsigned int *dimBase = &c_Table[threadIdx.y][0];
unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x;
unsigned int threadN = MUL(blockDim.x, gridDim.x);
for (unsigned int pos = tid; pos < N; pos += threadN)
{
unsigned int result = 0;
unsigned int data = seed + pos;
for (int bit = 0; bit < QRNG_RESOLUTION; bit++, data >>= 1)
if (data & 1)
{
result ^= dimBase[bit];
}
d_Output[MUL(threadIdx.y, N) + pos] = (float)(result + 1) * INT_SCALE;
}
}
////////////////////////////////////////////////////////////////////////////////
// Moro's Inverse Cumulative Normal Distribution function approximation
////////////////////////////////////////////////////////////////////////////////
__device__ inline float MoroInvCNDgpu(unsigned int x)
{
const float a1 = 2.50662823884f;
const float a2 = -18.61500062529f;
const float a3 = 41.39119773534f;
const float a4 = -25.44106049637f;
const float b1 = -8.4735109309f;
const float b2 = 23.08336743743f;
const float b3 = -21.06224101826f;
const float b4 = 3.13082909833f;
const float c1 = 0.337475482272615f;
const float c2 = 0.976169019091719f;
const float c3 = 0.160797971491821f;
const float c4 = 2.76438810333863E-02f;
const float c5 = 3.8405729373609E-03f;
const float c6 = 3.951896511919E-04f;
const float c7 = 3.21767881768E-05f;
const float c8 = 2.888167364E-07f;
const float c9 = 3.960315187E-07f;
float z;
bool negate = false;
// Ensure the conversion to floating point will give a value in the
// range (0,0.5] by restricting the input to the bottom half of the
// input domain. We will later reflect the result if the input was
// originally in the top half of the input domain
if (x >= 0x80000000UL)
{
x = 0xffffffffUL - x;
negate = true;
}
// x is now in the range [0,0x80000000) (i.e. [0,0x7fffffff])
// Convert to floating point in (0,0.5]
const float x1 = 1.0f / static_cast<float>(0xffffffffUL);
const float x2 = x1 / 2.0f;
float p1 = x * x1 + x2;
// Convert to floating point in (-0.5,0]
float p2 = p1 - 0.5f;
// The input to the Moro inversion is p2 which is in the range
// (-0.5,0]. This means that our output will be the negative side
// of the bell curve (which we will reflect if "negate" is true).
// Main body of the bell curve for |p| < 0.42
if (p2 > -0.42f)
{
z = p2 * p2;
z = p2 * (((a4 * z + a3) * z + a2) * z + a1) / ((((b4 * z + b3) * z + b2) * z + b1) * z + 1.0f);
}
// Special case (Chebychev) for tail
else
{
z = __logf(-__logf(p1));
z = - (c1 + z * (c2 + z * (c3 + z * (c4 + z * (c5 + z * (c6 + z * (c7 + z * (c8 + z * c9))))))));
}
// If the original input (x) was in the top half of the range, reflect
// to get the positive side of the bell curve
return negate ? -z : z;
}
////////////////////////////////////////////////////////////////////////////////
// Main kernel. Choose between transforming
// input sequence and uniform ascending (0, 1) sequence
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void inverseCNDKernel(
float *d_Output,
unsigned int pathN
)
{
unsigned int distance = ((unsigned int)-1) / (pathN + 1);
unsigned int tid = MUL(blockDim.x, blockIdx.x) + threadIdx.x;
unsigned int threadN = MUL(blockDim.x, gridDim.x);
//Transform input number sequence if it's supplied
if (0) //d_Input)
{
/*
for (unsigned int pos = tid; pos < pathN; pos += threadN)
{
unsigned int d = d_Input[pos];
d_Output[pos] = (float)MoroInvCNDgpu(d);
}
*/
}
//Else generate input uniformly placed samples on the fly
//and write to destination
else
{
for (unsigned int pos = tid; pos < pathN; pos += threadN)
{
unsigned int d = (pos + 1) * distance;
d_Output[pos] = (float)MoroInvCNDgpu(d);
}
}
}
#endif
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef QUASIRANDOMGENERATOR_GPU_CUH
#define QUASIRANDOMGENERATOR_GPU_CUH
#include <nvrtc_helper.h>
#include "quasirandomGenerator_common.h"
//Fast integer multiplication
#define MUL(a, b) __umul24(a, b)
// Global variables for nvrtc outputs
char *ptx;
size_t ptxSize;
CUmodule module;
////////////////////////////////////////////////////////////////////////////////
// GPU code
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Niederreiter quasirandom number generation kernel
////////////////////////////////////////////////////////////////////////////////
//Table initialization routine
void initTableGPU(unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION])
{
CUdeviceptr c_Table;
checkCudaErrors(cuModuleGetGlobal(&c_Table, NULL, module, "c_Table"));
checkCudaErrors(cuMemcpyHtoD(c_Table, tableCPU, QRNG_DIMENSIONS * QRNG_RESOLUTION * sizeof(unsigned int)));
}
//Host-side interface
void quasirandomGeneratorGPU(CUdeviceptr d_Output, unsigned int seed, unsigned int N)
{
dim3 threads(128, QRNG_DIMENSIONS);
dim3 cudaGridSize(128, 1, 1);
CUfunction kernel_addr;
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "quasirandomGeneratorKernel"));
void *args[] = { (void *)&d_Output, (void *)&seed, (void *)&N };
checkCudaErrors(cuLaunchKernel(kernel_addr,
cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
threads.x, threads.y, threads.z, /* block dim */
0,0, /* shared mem, stream */
&args[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
void inverseCNDgpu(CUdeviceptr d_Output, unsigned int N)
{
dim3 threads(128, 1,1);
dim3 cudaGridSize(128, 1, 1);
CUfunction kernel_addr;
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "inverseCNDKernel"));
void *args[] = { (void *)&d_Output, (void *)&N };
checkCudaErrors(cuLaunchKernel(kernel_addr,
cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
threads.x, threads.y, threads.z, /* block dim */
0,0, /* shared mem, stream */
&args[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
#endif
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// System includes
#include <stdio.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif
__global__ void testKernel(int val)
{
printf("[%d, %d]:\t\tValue is:%d\n",\
blockIdx.y*gridDim.x+blockIdx.x,\
threadIdx.z*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x,\
val);
}
int main(int argc, char **argv)
{
int devID;
cudaDeviceProp props;
// This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv);
//Get GPU information
checkCudaErrors(cudaGetDevice(&devID));
checkCudaErrors(cudaGetDeviceProperties(&props, devID));
printf("Device %d: \"%s\" with Compute %d.%d capability\n",
devID, props.name, props.major, props.minor);
printf("printf() is called. Output:\n\n");
//Kernel configuration, where a two-dimensional grid and
//three-dimensional blocks are configured.
dim3 dimGrid(2, 2);
dim3 dimBlock(2, 2, 2);
testKernel<<<dimGrid, dimBlock>>>(10);
cudaDeviceSynchronize();
return EXIT_SUCCESS;
}
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Matrix multiplication: C = A * B.
* Device code.
*/
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) \
cutilBankChecker((reinterpret_cast<float *>(&As[0][0])), (block_size * i + j))
#define BS(i, j) \
cutilBankChecker((reinterpret_cast<float *>(&Bs[0][0])), (block_size * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
size_type wB) {
// Block index
size_type bx = blockIdx.x;
size_type by = blockIdx.y;
// Thread index
size_type tx = threadIdx.x;
size_type ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
size_type aBegin = wA * block_size * by;
// Index of the last sub-matrix of A processed by the block
size_type aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
size_type aStep = block_size;
// Index of the first sub-matrix of B processed by the block
size_type bBegin = block_size * bx;
// Step size used to iterate through the sub-matrices of B
size_type bStep = block_size * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[block_size][block_size];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[block_size][block_size];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
size_type c = wB * block_size * by + block_size * bx;
C[c + wB * ty + tx] = Csub;
}
// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs16_32bit(float *C, float *A, float *B,
int wA, int wB) {
matrixMul<16, int>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_32bit(float *C, float *A, float *B,
int wA, int wB) {
matrixMul<32, int>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<32, size_t>(C, A, B, wA, wB);
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
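////////////////////////////////////////////////////////////////////////////////
// Illustrative host reference (not part of the original sample): a plain
// triple-loop C = A * B for checking the tiled kernels above. hA is A's height;
// wA and wB match the kernel parameters; the function name is hypothetical.
////////////////////////////////////////////////////////////////////////////////
static void exampleMatrixMulHost(float *C, const float *A, const float *B,
                                 int hA, int wA, int wB) {
  for (int i = 0; i < hA; ++i) {
    for (int j = 0; j < wB; ++j) {
      float sum = 0.0f;
      for (int k = 0; k < wA; ++k) {
        sum += A[i * wA + k] * B[k * wB + j];
      }
      C[i * wB + j] = sum;
    }
  }
}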
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample implements a simple task consumer using threads and streams
* with all data in Unified Memory, and tasks consumed by both host and device
*/
// system includes
#include <cstdio>
#include <ctime>
#include <vector>
#include <algorithm>
#ifdef USE_PTHREADS
#include <pthread.h>
#else
#include <omp.h>
#endif
#include <stdlib.h>
// cuBLAS
#include <cublas_v2.h>
// utilities
#include <helper_cuda.h>
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// SRAND48 and DRAND48 don't exist on windows, but these are the equivalent functions
void srand48(long seed)
{
srand((unsigned int)seed);
}
double drand48()
{
return double(rand())/RAND_MAX;
}
#endif
const char *sSDKname = "UnifiedMemoryStreams";
// simple task
template <typename T>
struct Task
{
unsigned int size, id;
T *data;
T *result;
T *vector;
Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {};
Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL)
{
// allocate unified memory -- the operation performed in this example will be a DGEMV
checkCudaErrors(cudaMallocManaged(&data, sizeof(T)*size*size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T)*size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T)*size));
checkCudaErrors(cudaDeviceSynchronize());
}
~Task()
{
// ensure all memory is deallocated
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaFree(data));
checkCudaErrors(cudaFree(result));
checkCudaErrors(cudaFree(vector));
}
void allocate(const unsigned int s, const unsigned int unique_id)
{
// allocate unified memory outside of constructor
id = unique_id;
size = s;
checkCudaErrors(cudaMallocManaged(&data, sizeof(T)*size*size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T)*size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T)*size));
checkCudaErrors(cudaDeviceSynchronize());
// populate data with random elements
for (int i=0; i<size*size; i++)
{
data[i] = drand48();
}
for (int i=0; i<size; i++)
{
result[i] = 0.;
vector[i] = drand48();
}
}
};
#ifdef USE_PTHREADS
struct threadData_t
{
int tid;
Task<double> *TaskListPtr;
cudaStream_t *streams;
cublasHandle_t *handles;
int taskSize;
};
typedef struct threadData_t threadData;
#endif
// simple host dgemv: assume data is in row-major format and square
template <typename T>
void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
{
// rows
for (int i=0; i<n; i++)
{
result[i] *= beta;
for (int j=0; j<n; j++)
{
result[i] += A[i*n+ j]*x[j];
}
}
}
// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void* execute(void* inpArgs)
{
threadData *dataPtr = (threadData *) inpArgs;
cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles;
int tid = dataPtr->tid;
for (int i = 0; i < dataPtr->taskSize; i++)
{
Task<double> &t = dataPtr->TaskListPtr[i];
if (t.size < 100)
{
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n",t.id,tid,t.size);
// attach managed memory to a (dummy) stream to allow host access while the device is running
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
}
else
{
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n",t.id,tid,t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid+1], stream[tid+1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
}
}
pthread_exit(NULL);
}
#else
template <typename T>
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
if (t.size < 100)
{
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n",t.id,tid,t.size);
// attach managed memory to a (dummy) stream to allow host access while the device is running
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
}
else
{
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n",t.id,tid,t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid+1], stream[tid+1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
}
}
#endif
// populate a list of tasks with random sizes
template <typename T>
void initialise_tasks(std::vector< Task<T> > &TaskList)
{
for (unsigned int i=0; i<TaskList.size(); i++)
{
// generate random size
int size;
size = std::max((int)(drand48()*1000.0), 64);
TaskList[i].allocate(size, i);
}
}
int main(int argc, char **argv)
{
// set device
cudaDeviceProp device_prop;
int dev_id = findCudaDevice(argc, (const char **) argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
if (!device_prop.managedMemory) {
// This sample requires a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n");
exit(EXIT_WAIVED);
}
if (device_prop.computeMode == cudaComputeModeProhibited)
{
// This sample requires being run with a default or process exclusive mode
fprintf(stderr, "This sample requires a device in either default or process exclusive mode\n");
exit(EXIT_WAIVED);
}
// randomise task sizes
int seed = time(NULL);
srand48(seed);
// set number of threads
const int nthreads = 4;
// number of streams = number of threads
cudaStream_t *streams = new cudaStream_t[nthreads+1];
cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
for (int i=0; i<nthreads+1; i++)
{
checkCudaErrors(cudaStreamCreate(&streams[i]));
checkCudaErrors(cublasCreate(&handles[i]));
}
// create list of N tasks
unsigned int N = 40;
std::vector<Task<double> > TaskList(N);
initialise_tasks(TaskList);
printf("Executing tasks on host / device\n");
// run through all tasks using threads and streams
#ifdef USE_PTHREADS
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
for (int i=0; i < nthreads; i++)
{
checkCudaErrors(cudaSetDevice(dev_id));
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles;
if ((TaskList.size() / nthreads) == 0)
{
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i*(TaskList.size() / nthreads)];
}
else
{
if (i == nthreads - 1)
{
InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i*(TaskList.size() / nthreads)]; // last thread starts right after the previous chunk and also picks up the remainder
}
else
{
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i*(TaskList.size() / nthreads)];
}
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i=0; i < nthreads; i++)
{
pthread_join(threads[i], NULL);
}
#else
omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
for (int i=0; i<TaskList.size(); i++)
{
checkCudaErrors(cudaSetDevice(dev_id));
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
#endif
cudaDeviceSynchronize();
// Destroy CUDA Streams, cuBlas handles
for (int i=0; i<nthreads+1; i++)
{
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
// Free TaskList
std::vector< Task<double> >().swap(TaskList);
printf("All Done!\n");
exit(EXIT_SUCCESS);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Multi-GPU sample using OpenMP for threading on the CPU side
* needs a compiler that supports OpenMP 2.0
*/
#include <omp.h>
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe
#include <helper_cuda.h>
using namespace std;
// a simple kernel that simply increments each array element by b
__global__ void kernelAddConstant(int *g_a, const int b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_a[idx] += b;
}
// a predicate that checks whether each array element is set to its index plus b
int correctResult(int *data, const int n, const int b)
{
for (int i = 0; i < n; i++)
if (data[i] != i + b)
return 0;
return 1;
}
int main(int argc, char *argv[])
{
int num_gpus = 0; // number of CUDA GPUs
printf("%s Starting...\n\n", argv[0]);
/////////////////////////////////////////////////////////////////
// determine the number of CUDA capable GPUs
//
cudaGetDeviceCount(&num_gpus);
if (num_gpus < 1)
{
printf("no CUDA capable devices were detected\n");
return 1;
}
/////////////////////////////////////////////////////////////////
// display CPU and GPU configuration
//
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
for (int i = 0; i < num_gpus; i++)
{
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
printf("---------------------------\n");
/////////////////////////////////////////////////////////////////
// initialize data
//
unsigned int n = num_gpus * 8192;
unsigned int nbytes = n * sizeof(int);
int *a = 0; // pointer to data on the CPU
int b = 3; // value by which the array is incremented
a = (int *)malloc(nbytes);
if (0 == a)
{
printf("couldn't allocate CPU memory\n");
return 1;
}
for (unsigned int i = 0; i < n; i++)
a[i] = i;
////////////////////////////////////////////////////////////////
// run as many CPU threads as there are CUDA devices
// each CPU thread controls a different device, processing its
// portion of the data. It's possible to use more CPU threads
// than there are CUDA devices, in which case several CPU
// threads will be allocating resources and launching kernels
// on the same device. For example, try omp_set_num_threads(2*num_gpus);
// Recall that all variables declared inside an "omp parallel" scope are
// local to each CPU thread
//
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
//omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there are CUDA devices
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
// set and check the CUDA device for this CPU thread
int gpu_id = -1;
checkCudaErrors(cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
checkCudaErrors(cudaGetDevice(&gpu_id));
printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
int *d_a = 0; // pointer to memory on the device associated with this CPU thread
int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
dim3 gpu_threads(128); // 128 threads per block
dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_a));
}
printf("---------------------------\n");
cudaError_t last_error = cudaGetLastError(); // fetch once: cudaGetLastError() resets the error state
if (cudaSuccess != last_error)
printf("%s\n", cudaGetErrorString(last_error));
////////////////////////////////////////////////////////////////
// check the result
//
bool bResult = correctResult(a, n, b);
if (a)
free(a); // free CPU memory
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <vector>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#define THREADS_PER_BLOCK 512
#define GRAPH_LAUNCH_ITERATIONS 3
typedef struct callBackData
{
const char *fn_name;
double *data;
}callBackData_t;
__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, size_t outputSize)
{
__shared__ double tmp[THREADS_PER_BLOCK];
cg::thread_block cta = cg::this_thread_block();
size_t globaltid = blockIdx.x*blockDim.x + threadIdx.x;
double temp_sum = 0.0;
for (int i=globaltid; i < inputSize; i+=gridDim.x*blockDim.x)
{
temp_sum += (double) inputVec[i];
}
tmp[cta.thread_rank()] = temp_sum;
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
double beta = temp_sum;
double temp;
for (int i = tile32.size() / 2; i > 0; i >>= 1) {
if (tile32.thread_rank() < i) {
temp = tmp[cta.thread_rank() + i];
beta += temp;
tmp[cta.thread_rank()] = beta;
}
cg::sync(tile32);
}
cg::sync(cta);
if (cta.thread_rank() == 0 && blockIdx.x < outputSize) {
beta = 0.0;
for (int i = 0; i < cta.size(); i += tile32.size()) {
beta += tmp[i];
}
outputVec[blockIdx.x] = beta;
}
}
__global__ void reduceFinal(double *inputVec, double *result, size_t inputSize)
{
__shared__ double tmp[THREADS_PER_BLOCK];
cg::thread_block cta = cg::this_thread_block();
size_t globaltid = blockIdx.x*blockDim.x + threadIdx.x;
double temp_sum = 0.0;
for (int i=globaltid; i < inputSize; i+=gridDim.x*blockDim.x)
{
temp_sum += (double) inputVec[i];
}
tmp[cta.thread_rank()] = temp_sum;
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
// do reduction in shared mem
if ((blockDim.x >= 512) && (cta.thread_rank() < 256))
{
tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 256];
}
cg::sync(cta);
if ((blockDim.x >= 256) &&(cta.thread_rank() < 128))
{
tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 128];
}
cg::sync(cta);
if ((blockDim.x >= 128) && (cta.thread_rank() < 64))
{
tmp[cta.thread_rank()] = temp_sum = temp_sum + tmp[cta.thread_rank() + 64];
}
cg::sync(cta);
if (cta.thread_rank() < 32)
{
// Fetch final intermediate sum from 2nd warp
if (blockDim.x >= 64) temp_sum += tmp[cta.thread_rank() + 32];
// Reduce final warp using shuffle
for (int offset = tile32.size()/2; offset > 0; offset /= 2)
{
temp_sum += tile32.shfl_down(temp_sum, offset);
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0) result[0] = temp_sum;
}
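// Illustrative host check (not part of the original sample): the sum that the
// two-stage reduce / reduceFinal pipeline above should reproduce, up to
// rounding differences caused by the different summation order. The function
// name is hypothetical.
static double exampleHostSum(const float *inputVec, size_t inputSize)
{
    double sum = 0.0;
    for (size_t i = 0; i < inputSize; i++)
    {
        sum += (double)inputVec[i];
    }
    return sum;
}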
void init_input(float*a, size_t size)
{
for (size_t i=0; i < size; i++)
a[i] = (rand() & 0xFF) / (float)RAND_MAX;
}
void CUDART_CB myHostNodeCallback(void *data)
{
// Check status of GPU after stream operations are done
callBackData_t* tmp = (callBackData_t*)(data);
//checkCudaErrors(tmp->status);
double *result = (double*)(tmp->data);
char *function = (char*)(tmp->fn_name);
printf("[%s] Host callback final reduced sum = %lf\n", function, *result);
*result = 0.0; // reset the result
}
void cudaGraphsManual(float* inputVec_h, float *inputVec_d, double *outputVec_d, double *result_d, size_t inputSize, size_t numOfBlocks)
{
cudaStream_t streamForGraph;
cudaGraph_t graph;
std::vector<cudaGraphNode_t> nodeDependencies;
cudaGraphNode_t memcpyNode, kernelNode, memsetNode;
double result_h = 0.0;
checkCudaErrors(cudaStreamCreate(&streamForGraph));
cudaKernelNodeParams kernelNodeParams = {0};
cudaMemcpy3DParms memcpyParams = {0};
cudaMemsetParams memsetParams = {0};
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0,0,0);
memcpyParams.srcPtr = make_cudaPitchedPtr(inputVec_h, sizeof(float)*inputSize, inputSize, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0,0,0);
memcpyParams.dstPtr = make_cudaPitchedPtr(inputVec_d, sizeof(float)*inputSize, inputSize, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float)*inputSize, 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;
memsetParams.dst = (void*)outputVec_d;
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(float); // elementSize can be max 4 bytes
memsetParams.width = numOfBlocks*2;
memsetParams.height = 1;
checkCudaErrors(cudaGraphCreate(&graph, 0));
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams));
checkCudaErrors(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams));
nodeDependencies.push_back(memsetNode);
nodeDependencies.push_back(memcpyNode);
void *kernelArgs[4] = {(void*)&inputVec_d, (void*)&outputVec_d, &inputSize, &numOfBlocks};
kernelNodeParams.func = (void*)reduce;
kernelNodeParams.gridDim = dim3(numOfBlocks, 1, 1);
kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = (void **)kernelArgs;
kernelNodeParams.extra = NULL;
checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams));
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode);
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = result_d;
memsetParams.value = 0;
memsetParams.elementSize = sizeof(float);
memsetParams.width = 2;
memsetParams.height = 1;
checkCudaErrors(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams));
nodeDependencies.push_back(memsetNode);
memset(&kernelNodeParams, 0, sizeof(kernelNodeParams));
kernelNodeParams.func = (void*)reduceFinal;
kernelNodeParams.gridDim = dim3(1, 1, 1);
kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
void *kernelArgs2[3] = {(void*)&outputVec_d, (void*)&result_d, &numOfBlocks};
kernelNodeParams.kernelParams = kernelArgs2;
kernelNodeParams.extra = NULL;
checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams));
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode);
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0,0,0);
memcpyParams.srcPtr = make_cudaPitchedPtr(result_d, sizeof(double), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0,0,0);
memcpyParams.dstPtr = make_cudaPitchedPtr(&result_h, sizeof(double), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1);
memcpyParams.kind = cudaMemcpyDeviceToHost;
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams));
nodeDependencies.clear();
nodeDependencies.push_back(memcpyNode);
cudaGraphNode_t hostNode;
cudaHostNodeParams hostParams = {0};
hostParams.fn = myHostNodeCallback;
callBackData_t hostFnData;
hostFnData.data = &result_h;
hostFnData.fn_name = "cudaGraphsManual";
hostParams.userData = &hostFnData;
checkCudaErrors(cudaGraphAddHostNode(&hostNode, graph, nodeDependencies.data(), nodeDependencies.size(), &hostParams));
cudaGraphNode_t *nodes = NULL;
size_t numNodes = 0;
checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes));
printf("\nNum of nodes in the graph created manually = %zu\n", numNodes);
cudaGraphExec_t graphExec;
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
cudaGraph_t clonedGraph;
cudaGraphExec_t clonedGraphExec;
checkCudaErrors(cudaGraphClone(&clonedGraph, graph));
checkCudaErrors(cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0));
for (int i=0; i < GRAPH_LAUNCH_ITERATIONS; i++)
{
checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
printf("Cloned Graph Output.. \n");
for (int i=0; i < GRAPH_LAUNCH_ITERATIONS; i++)
{
checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
checkCudaErrors(cudaGraphExecDestroy(graphExec));
checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec));
checkCudaErrors(cudaGraphDestroy(graph));
checkCudaErrors(cudaGraphDestroy(clonedGraph));
checkCudaErrors(cudaStreamDestroy(streamForGraph));
}
void cudaGraphsUsingStreamCapture(float* inputVec_h, float *inputVec_d, double *outputVec_d, double *result_d, size_t inputSize, size_t numOfBlocks)
{
cudaStream_t stream1, stream2, streamForGraph;
cudaEvent_t reduceKernelEvent;
cudaGraph_t graph;
double result_h = 0.0;
checkCudaErrors(cudaStreamCreate(&stream1));
checkCudaErrors(cudaStreamCreate(&stream2));
checkCudaErrors(cudaStreamCreate(&streamForGraph));
checkCudaErrors(cudaEventCreate(&reduceKernelEvent));
checkCudaErrors(cudaStreamBeginCapture(stream1));
checkCudaErrors(cudaMemcpyAsync(inputVec_d, inputVec_h, sizeof(float)*inputSize, cudaMemcpyDefault, stream1));
checkCudaErrors(cudaMemsetAsync(outputVec_d, 0, sizeof(double)*numOfBlocks, stream1));
reduce<<<numOfBlocks, THREADS_PER_BLOCK, 0, stream1>>>(inputVec_d, outputVec_d, inputSize, numOfBlocks);
checkCudaErrors(cudaEventRecord(reduceKernelEvent, stream1));
checkCudaErrors(cudaStreamWaitEvent(stream2, reduceKernelEvent, 0));
checkCudaErrors(cudaMemsetAsync(result_d, 0, sizeof(double), stream2));
reduceFinal<<<1, THREADS_PER_BLOCK, 0, stream2>>>(outputVec_d, result_d, numOfBlocks);
checkCudaErrors(cudaMemcpyAsync(&result_h, result_d, sizeof(double), cudaMemcpyDefault, stream2));
checkCudaErrors(cudaEventRecord(reduceKernelEvent, stream2));
checkCudaErrors(cudaStreamWaitEvent(stream1, reduceKernelEvent, 0));
callBackData_t hostFnData = {0};
hostFnData.data = &result_h;
hostFnData.fn_name = "cudaGraphsUsingStreamCapture";
cudaHostFn_t fn = myHostNodeCallback;
checkCudaErrors(cudaLaunchHostFunc(stream1, fn, &hostFnData));
checkCudaErrors(cudaStreamEndCapture(stream1, &graph));
cudaGraphNode_t *nodes = NULL;
size_t numNodes = 0;
checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes));
printf("\nNum of nodes in the graph created using stream capture API = %zu\n", numNodes);
cudaGraphExec_t graphExec;
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
cudaGraph_t clonedGraph;
cudaGraphExec_t clonedGraphExec;
checkCudaErrors(cudaGraphClone(&clonedGraph, graph));
checkCudaErrors(cudaGraphInstantiate(&clonedGraphExec, clonedGraph, NULL, NULL, 0));
for (int i=0; i < GRAPH_LAUNCH_ITERATIONS; i++)
{
checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
printf("Cloned Graph Output.. \n");
for (int i=0; i < GRAPH_LAUNCH_ITERATIONS; i++)
{
checkCudaErrors(cudaGraphLaunch(clonedGraphExec, streamForGraph));
}
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
checkCudaErrors(cudaGraphExecDestroy(graphExec));
checkCudaErrors(cudaGraphExecDestroy(clonedGraphExec));
checkCudaErrors(cudaGraphDestroy(graph));
checkCudaErrors(cudaGraphDestroy(clonedGraph));
checkCudaErrors(cudaStreamDestroy(stream1));
checkCudaErrors(cudaStreamDestroy(stream2));
checkCudaErrors(cudaStreamDestroy(streamForGraph));
}
int main(int argc, char **argv)
{
size_t size = 1<<24; // number of elements to reduce
size_t maxBlocks = 512;
// This will pick the best possible CUDA capable device
int devID = findCudaDevice(argc, (const char **)argv);
printf("%zu elements\n", size);
printf("threads per block = %d\n", THREADS_PER_BLOCK);
printf("Graph Launch iterations = %d\n", GRAPH_LAUNCH_ITERATIONS);
float *inputVec_d = NULL, *inputVec_h = NULL;
double *outputVec_d = NULL, *result_d;
inputVec_h = (float*) malloc(sizeof(float)*size);
checkCudaErrors(cudaMalloc(&inputVec_d, sizeof(float)*size));
checkCudaErrors(cudaMalloc(&outputVec_d, sizeof(double)*maxBlocks));
checkCudaErrors(cudaMalloc(&result_d, sizeof(double)));
init_input(inputVec_h, size);
cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks);
cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, size, maxBlocks);
checkCudaErrors(cudaFree(inputVec_d));
checkCudaErrors(cudaFree(outputVec_d));
checkCudaErrors(cudaFree(result_d));
return EXIT_SUCCESS;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample demonstrates how to use texture fetches from layered 2D textures in CUDA C
*
* This sample first generates a 3D input data array for the layered texture
* and the expected output. Then it starts CUDA C kernels, one for each layer,
* which fetch their layer's texture data (using normalized texture coordinates)
* transform it to the expected output, and write it to a 3D output data array.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes CUDA
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
static const char *sSDKname = "simpleCubemapTexture";
// includes, kernels
// declare texture reference for the cubemap float texture
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType1D, etc.
texture<float, cudaTextureTypeCubemap> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform a cubemap face of a linear buffer using cubemap texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(float *g_odata, int width)
{
// calculate this thread's data point
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
// 0.5f offset and division are necessary to access the original data points
// in the texture (such that bilinear interpolation will not be activated).
// For details, see also CUDA Programming Guide, Appendix D
float u = ((x+0.5f) / (float) width) * 2.f - 1.f;
float v = ((y+0.5f) / (float) width) * 2.f - 1.f;
float cx, cy, cz;
for (unsigned int face = 0; face < 6; face ++)
{
//Layer 0 is positive X face
if (face == 0)
{
cx = 1;
cy = -v;
cz = -u;
}
//Layer 1 is negative X face
else if (face == 1)
{
cx = -1;
cy = -v;
cz = u;
}
//Layer 2 is positive Y face
else if (face == 2)
{
cx = u;
cy = 1;
cz = v;
}
//Layer 3 is negative Y face
else if (face == 3)
{
cx = u;
cy = -1;
cz = -v;
}
//Layer 4 is positive Z face
else if (face == 4)
{
cx = u;
cy = -v;
cz = 1;
}
//Layer 5 is negative Z face
else if (face == 5)
{
cx = -u;
cy = -v;
cz = -1;
}
// read from texture, do expected transformation and write to global memory
g_odata[face*width*width + y*width + x] = -texCubemap(tex, cx, cy, cz);
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);
bool bResult = true;
// get number of SMs on this GPU
cudaDeviceProp deviceProps;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
if (deviceProps.major < 2)
{
printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test will exit... \n", sSDKname);
exit(EXIT_WAIVED);
}
// generate input data for layered texture
unsigned int width=64, num_faces = 6, num_layers = 1;
unsigned int cubemap_size = width * width * num_faces;
unsigned int size = cubemap_size * num_layers * sizeof(float);
float *h_data = (float *) malloc(size);
for (int i = 0; i < (int)(cubemap_size * num_layers); i++)
{
h_data[i] = (float)i;
}
// this is the expected transformation of the input data (the expected output)
float *h_data_ref = (float *) malloc(size);
for (unsigned int layer = 0; layer < num_layers; layer++)
{
for (int i = 0; i < (int)(cubemap_size); i++)
{
h_data_ref[layer*cubemap_size + i] = -h_data[layer*cubemap_size + i] + layer;
}
}
// allocate device memory for result
float *d_data = NULL;
checkCudaErrors(cudaMalloc((void **) &d_data, size));
// allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *cu_3darray;
// checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
cudaMemcpy3DParms myparms = {0};
myparms.srcPos = make_cudaPos(0,0,0);
myparms.dstPos = make_cudaPos(0,0,0);
myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
myparms.dstArray = cu_3darray;
myparms.extent = make_cudaExtent(width, width, num_faces);
myparms.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaMemcpy3D(&myparms));
// set texture parameters
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true; // access with normalized texture coordinates
// Bind the array to the texture
checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));
dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);
printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
width, num_layers, dimGrid.x, dimGrid.y);
transformKernel<<< dimGrid, dimBlock >>>(d_data, width); // warmup (for better timing)
// check if kernel execution generated an error
getLastCudaError("warmup Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
// execute the kernel
transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer);
printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
sdkDeleteTimer(&timer);
// allocate mem for the result on host side
float *h_odata = (float *) malloc(size);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));
// write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression"))
{
// write file for regression test
sdkWriteFile<float>("./data/regression.dat", h_odata, width*width, 0.0f, false);
}
else
{
printf("Comparing kernel output to expected data\n");
#define MIN_EPSILON_ERROR 5e-3f
bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
}
// cleanup memory
free(h_data);
free(h_data_ref);
free(h_odata);
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFreeArray(cu_3darray));
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* pitchLinearTexture
*
* This example demonstrates how to use textures bound to pitch linear memory.
* It performs a shift of matrix elements using wrap addressing mode (aka
* periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
* in order to highlight the differences in using each.
*
* Binding textures to pitch linear memory is a new feature in CUDA 2.2,
* and allows use of texture features such as wrap addressing mode and
* filtering which are not possible with textures bound to regular linear memory
*/
// includes, system
#include <stdio.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#endif
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
#define NUM_REPS 100 // number of repetitions performed
#define TILE_DIM 16 // tile/block size
const char *sSDKsample = "simplePitchLinearTexture";
////////////////////////////////////////////////////////////////////////////////
// Texture references
texture<float, 2, cudaReadModeElementType> texRefPL;
texture<float, 2, cudaReadModeElementType> texRefArray;
// Auto-Verification Code
bool bTestResult = true;
////////////////////////////////////////////////////////////////////////////////
// NB: (1) The second argument "pitch" is in elements, not bytes
// (2) normalized coordinates are used (required for wrap address mode)
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using pitch linear array
//! @param odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void shiftPitchLinear(float *odata,
int pitch,
int width,
int height,
int shiftX,
int shiftY)
{
int xid = blockIdx.x * blockDim.x + threadIdx.x;
int yid = blockIdx.y * blockDim.y + threadIdx.y;
odata[yid * pitch + xid] = tex2D(texRefPL,
(xid + shiftX) / (float) width,
(yid + shiftY) / (float) height);
}
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using regular array
//! @param odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void shiftArray(float *odata,
int pitch,
int width,
int height,
int shiftX,
int shiftY)
{
int xid = blockIdx.x * blockDim.x + threadIdx.x;
int yid = blockIdx.y * blockDim.y + threadIdx.y;
odata[yid * pitch + xid] = tex2D(texRefArray,
(xid + shiftX) / (float) width,
(yid + shiftY) / (float) height);
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("%s starting...\n\n", sSDKsample);
runTest(argc, argv);
printf("%s completed, returned %s\n",
sSDKsample,
bTestResult ? "OK" : "ERROR!");
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
// Set array size
const int nx = 2048;
const int ny = 2048;
// Setup shifts applied to x and y data
const int x_shift = 5;
const int y_shift = 7;
if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0))
{
printf("nx and ny must be multiples of TILE_DIM\n");
exit(EXIT_FAILURE);
}
// Setup execution configuration parameters
dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);
// This will pick the best possible CUDA capable device
int devID = findCudaDevice(argc, (const char **)argv);
// CUDA events for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Host allocation and initialization
float *h_idata = (float *) malloc(sizeof(float) * nx * ny);
float *h_odata = (float *) malloc(sizeof(float) * nx * ny);
float *gold = (float *) malloc(sizeof(float) * nx * ny);
for (int i = 0; i < nx * ny; ++i)
{
h_idata[i] = (float) i;
}
// Device memory allocation
// Pitch linear input data
float *d_idataPL;
size_t d_pitchBytes;
checkCudaErrors(cudaMallocPitch((void **) &d_idataPL,
&d_pitchBytes,
nx * sizeof(float),
ny));
// Array input data
cudaArray *d_idataArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));
// Pitch linear output data
float *d_odata;
checkCudaErrors(cudaMallocPitch((void **) &d_odata,
&d_pitchBytes,
nx * sizeof(float),
ny));
// Copy host data to device
// Pitch linear
size_t h_pitchBytes = nx * sizeof(float);
checkCudaErrors(cudaMemcpy2D(d_idataPL,
d_pitchBytes,
h_idata,
h_pitchBytes,
nx * sizeof(float),
ny,
cudaMemcpyHostToDevice));
// Array
checkCudaErrors(cudaMemcpyToArray(d_idataArray,
0,
0,
h_idata,
nx * ny * sizeof(float),
cudaMemcpyHostToDevice));
// Bind texture to memory
// Pitch linear
texRefPL.normalized = 1;
texRefPL.filterMode = cudaFilterModePoint;
texRefPL.addressMode[0] = cudaAddressModeWrap;
texRefPL.addressMode[1] = cudaAddressModeWrap;
checkCudaErrors(cudaBindTexture2D(0,
&texRefPL,
d_idataPL,
&channelDesc,
nx,
ny,
d_pitchBytes));
// Array
texRefArray.normalized = 1;
texRefArray.filterMode = cudaFilterModePoint;
texRefArray.addressMode[0] = cudaAddressModeWrap;
texRefArray.addressMode[1] = cudaAddressModeWrap;
checkCudaErrors(cudaBindTextureToArray(texRefArray,
d_idataArray,
channelDesc));
// Reference calculation
for (int j = 0; j < ny; ++j)
{
int jshift = (j + y_shift) % ny;
for (int i = 0; i < nx; ++i)
{
int ishift = (i + x_shift) % nx;
gold[j * nx + i] = h_idata[jshift * nx + ishift];
}
}
// Run ShiftPitchLinear kernel
checkCudaErrors(cudaMemset2D(d_odata,
d_pitchBytes,
0,
nx * sizeof(float),
ny));
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i)
{
shiftPitchLinear<<<dimGrid, dimBlock>>>
(d_odata,
(int)(d_pitchBytes / sizeof(float)),
nx,
ny,
x_shift,
y_shift);
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float timePL;
checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));
// Check results
checkCudaErrors(cudaMemcpy2D(h_odata,
h_pitchBytes,
d_odata,
d_pitchBytes,
nx * sizeof(float),
ny,
cudaMemcpyDeviceToHost));
bool res = compareData(gold, h_odata, nx*ny, 0.0f, 0.15f);
bTestResult = true;
if (res == false)
{
printf("*** shiftPitchLinear failed ***\n");
bTestResult = false;
}
// Run ShiftArray kernel
checkCudaErrors(cudaMemset2D(d_odata,
d_pitchBytes,
0,
nx * sizeof(float),
ny));
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i)
{
shiftArray<<<dimGrid, dimBlock>>>
(d_odata,
(int)(d_pitchBytes / sizeof(float)),
nx,
ny,
x_shift,
y_shift);
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float timeArray;
checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));
// Check results
checkCudaErrors(cudaMemcpy2D(h_odata,
h_pitchBytes,
d_odata,
d_pitchBytes,
nx * sizeof(float),
ny,
cudaMemcpyDeviceToHost));
res = compareData(gold, h_odata, nx*ny, 0.0f, 0.15f);
if (res == false)
{
printf("*** shiftArray failed ***\n");
bTestResult = false;
}
float bandwidthPL =
2.f * 1000.f * nx * ny * sizeof(float) /
(1.e+9f) / (timePL / NUM_REPS);
float bandwidthArray =
2.f * 1000.f * nx * ny * sizeof(float) /
(1.e+9f) / (timeArray / NUM_REPS);
printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n",
bandwidthPL, bandwidthArray);
float fetchRatePL =
nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
float fetchRateArray =
nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));
printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
"%.2e; for array: %.2e\n\n",
fetchRatePL, fetchRateArray);
// Cleanup
free(h_idata);
free(h_odata);
free(gold);
checkCudaErrors(cudaUnbindTexture(texRefPL));
checkCudaErrors(cudaUnbindTexture(texRefArray));
checkCudaErrors(cudaFree(d_idataPL));
checkCudaErrors(cudaFreeArray(d_idataArray));
checkCudaErrors(cudaFree(d_odata));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_
//****************************************************************************
// Because dynamically sized shared memory arrays are declared "extern",
// we can't templatize them directly. To get around this, we declare a
// simple wrapper struct that will declare the extern array with a different
// name depending on the type. This avoids compiler errors about duplicate
// definitions.
//
// To use dynamically allocated shared memory in a templatized __global__ or
// __device__ function, just replace code like this:
//
//
// template<class T>
// __global__ void
// foo( T* g_idata, T* g_odata)
// {
// // Shared mem size is determined by the host app at run time
// extern __shared__ T sdata[];
// ...
// doStuff(sdata);
// ...
// }
//
// With this
// template<class T>
// __global__ void
// foo( T* g_idata, T* g_odata)
// {
// // Shared mem size is determined by the host app at run time
// SharedMemory<T> smem;
// T* sdata = smem.getPointer();
// ...
// doStuff(sdata);
// ...
// }
//****************************************************************************
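// Illustrative launch (an assumed example, not from this header): the dynamic
// shared memory size is still supplied by the host as the third launch
// configuration parameter, e.g.
//     foo<float><<< grid, block, numElements * sizeof(float) >>>(d_in, d_out);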
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
template <typename T>
struct SharedMemory
{
// Ensure that we won't compile any un-specialized types
__device__ T *getPointer()
{
extern __device__ void error(void);
error();
return NULL;
}
};
// Following are the specializations for the following types.
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
// One could also specialize it for user-defined types.
template <>
struct SharedMemory <int>
{
__device__ int *getPointer()
{
extern __shared__ int s_int[];
return s_int;
}
};
template <>
struct SharedMemory <unsigned int>
{
__device__ unsigned int *getPointer()
{
extern __shared__ unsigned int s_uint[];
return s_uint;
}
};
template <>
struct SharedMemory <char>
{
__device__ char *getPointer()
{
extern __shared__ char s_char[];
return s_char;
}
};
template <>
struct SharedMemory <unsigned char>
{
__device__ unsigned char *getPointer()
{
extern __shared__ unsigned char s_uchar[];
return s_uchar;
}
};
template <>
struct SharedMemory <short>
{
__device__ short *getPointer()
{
extern __shared__ short s_short[];
return s_short;
}
};
template <>
struct SharedMemory <unsigned short>
{
__device__ unsigned short *getPointer()
{
extern __shared__ unsigned short s_ushort[];
return s_ushort;
}
};
template <>
struct SharedMemory <long>
{
__device__ long *getPointer()
{
extern __shared__ long s_long[];
return s_long;
}
};
template <>
struct SharedMemory <unsigned long>
{
__device__ unsigned long *getPointer()
{
extern __shared__ unsigned long s_ulong[];
return s_ulong;
}
};
template <>
struct SharedMemory <bool>
{
__device__ bool *getPointer()
{
extern __shared__ bool s_bool[];
return s_bool;
}
};
template <>
struct SharedMemory <float>
{
__device__ float *getPointer()
{
extern __shared__ float s_float[];
return s_float;
}
};
template <>
struct SharedMemory <double>
{
__device__ double *getPointer()
{
extern __shared__ double s_double[];
return s_double;
}
};
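// A minimal sketch of the user-defined specialization mentioned above (an
// illustrative addition, not part of the original header), here for CUDA's
// built-in float2 vector type:
template <>
struct SharedMemory <float2>
{
__device__ float2 *getPointer()
{
extern __shared__ float2 s_float2[];
return s_float2;
}
};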
#endif //_SHAREDMEM_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* This sample is a templatized version of the template project.
* It also shows how to correctly templatize dynamically allocated shared
* memory arrays.
* Host code.
*/
// System includes
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <math.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <timer.h>
#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif
// includes, kernels
#include "sharedmem.cuh"
int g_TotalFailures = 0;
////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_idata input data in global memory
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
template<class T>
__global__ void
testKernel(T *g_idata, T *g_odata)
{
// Shared mem size is determined by the host app at run time
SharedMemory<T> smem;
T *sdata = smem.getPointer();
// access thread id
const unsigned int tid = threadIdx.x;
// access number of threads in this block
const unsigned int num_threads = blockDim.x;
// read in input data from global memory
sdata[tid] = g_idata[tid];
__syncthreads();
// perform some computations
sdata[tid] = (T) num_threads * sdata[tid];
__syncthreads();
// write data to global memory
g_odata[tid] = sdata[tid];
}
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
template <class T>
void runTest(int argc, char **argv, int len);
template<class T>
void
computeGold(T *reference, T *idata, const unsigned int len)
{
const T T_len = static_cast<T>(len);
for (unsigned int i = 0; i < len; ++i)
{
reference[i] = idata[i] * T_len;
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
printf("> runTest<float,32>\n");
runTest<float>(argc, argv, 32);
printf("> runTest<int,64>\n");
runTest<int>(argc, argv, 64);
printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures);
exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
// To completely templatize runTest (below) with cutil, we need to use
// template specialization to wrap up CUTIL's array comparison and file writing
// functions for different types.
// Here's the generic wrapper for cutCompare*
template<class T>
class ArrayComparator
{
public:
bool compare(const T *reference, T *data, unsigned int len)
{
fprintf(stderr, "Error: no comparison function implemented for this type\n");
return false;
}
};
// Here's the specialization for ints:
template<>
class ArrayComparator<int>
{
public:
bool compare(const int *reference, int *data, unsigned int len)
{
return compareData(reference, data, len, 0.15f, 0.0f);
}
};
// Here's the specialization for floats:
template<>
class ArrayComparator<float>
{
public:
bool compare(const float *reference, float *data, unsigned int len)
{
return compareData(reference, data, len, 0.15f, 0.15f);
}
};
// Here's the generic wrapper for cutWriteFile*
template<class T>
class ArrayFileWriter
{
public:
bool write(const char *filename, T *data, unsigned int len, float epsilon)
{
fprintf(stderr, "Error: no file write function implemented for this type\n");
return false;
}
};
// Here's the specialization for ints:
template<>
class ArrayFileWriter<int>
{
public:
bool write(const char *filename, int *data, unsigned int len, float epsilon)
{
return sdkWriteFile(filename, data, len, epsilon, false);
}
};
// Here's the specialization for floats:
template<>
class ArrayFileWriter<float>
{
public:
bool write(const char *filename, float *data, unsigned int len, float epsilon)
{
return sdkWriteFile(filename, data, len, epsilon, false);
}
};
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
template<class T>
void
runTest(int argc, char **argv, int len)
{
int devID;
cudaDeviceProp deviceProps;
devID = findCudaDevice(argc, (const char **)argv);
// get number of SMs on this GPU
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
StartTimer();
unsigned int num_threads = len;
unsigned int mem_size = sizeof(T) * num_threads;
// allocate host memory
T *h_idata = (T *) malloc(mem_size);
// initialize the memory
for (unsigned int i = 0; i < num_threads; ++i)
{
h_idata[i] = (T) i;
}
// allocate device memory
T *d_idata;
checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size,
cudaMemcpyHostToDevice));
// allocate device memory for result
T *d_odata;
checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));
// setup execution parameters
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);
// execute the kernel
testKernel<T><<< grid, threads, mem_size >>>(d_idata, d_odata);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
// allocate mem for the result on host side
T *h_odata = (T *) malloc(mem_size);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads,
cudaMemcpyDeviceToHost));
printf("Processing time: %f (ms)\n", GetTimer());
// compute reference solution
T *reference = (T *) malloc(mem_size);
computeGold<T>(reference, h_idata, num_threads);
ArrayComparator<T> comparator;
ArrayFileWriter<T> writer;
// check result
if (checkCmdLineFlag(argc, (const char **) argv, "regression"))
{
// write file for regression test
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
}
else
{
// custom output handling when no regression test running
// in this case check if the result is equivalent to the expected solution
bool res = comparator.compare(reference, h_odata, num_threads);
printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH");
g_TotalFailures += (1 != res);
}
// cleanup memory
free(h_idata);
free(h_odata);
free(reference);
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata));
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*
*
* This sample illustrates the usage of CUDA streams for overlapping
* kernel execution with device/host memcopies. The kernel is used to
* initialize an array to a specific value, after which the array is
* copied to the host (CPU) memory. To increase performance, multiple
* kernel/memcopy pairs are launched asynchronously, each pair in its
* own stream. Devices with Compute Capability 1.1 can overlap a kernel
* and a memcopy as long as they are issued in different streams. Kernels
* are serialized. Thus, if n pairs are launched, the streamed approach
* can reduce the memcopy cost to the (1/n)th of a single copy of the entire
* data set.
*
* Additionally, this sample uses CUDA events to measure elapsed time for
* CUDA calls. Events are part of the CUDA API and provide a system-independent
* way to measure execution times on CUDA devices with approximately 0.5
* microsecond precision.
*
* Elapsed times are averaged over nreps repetitions (10 by default).
*
*/
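/*
* Worked example with assumed (not measured) figures: if copying the whole
* data set takes 8 ms and the kernel work also takes 8 ms, then with n = 4
* streams each chunk copy takes about 2 ms and can be hidden behind the
* kernels running in the other streams, so only roughly one chunk copy
* (1/n of the full copy, here 2 ms) remains exposed on top of the serialized
* kernel time.
*/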
const char *sSDKsample = "simpleStreams";
const char *sEventSyncMethod[] =
{
"cudaEventDefault",
"cudaEventBlockingSync",
"cudaEventDisableTiming",
NULL
};
const char *sDeviceSyncMethod[] =
{
"cudaDeviceScheduleAuto",
"cudaDeviceScheduleSpin",
"cudaDeviceScheduleYield",
"INVALID",
"cudaDeviceScheduleBlockingSync",
NULL
};
// System includes
#include <stdio.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#ifndef WIN32
#include <sys/mman.h> // for mmap() / munmap()
#endif
// Macro to align an address up to the memory alignment in question
#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x,size) ( ((size_t)x+(size-1))&(~(size-1)) )
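// For example, ALIGN_UP(0x1003, 4096) rounds the address up to 0x2000, while
// an address that is already 4096-byte aligned is returned unchanged.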
__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i=0; i<num_iterations; i++)
{
g_data[idx] += *factor; // non-coalesced on purpose, to burn time
}
}
bool correct_data(int *a, const int n, const int c)
{
for (int i = 0; i < n; i++)
{
if (a[i] != c)
{
printf("%d: %d %d\n", i, a[i], c);
return false;
}
}
return true;
}
inline void
AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
if (bPinGenericMemory)
{
// allocate a generic page-aligned chunk of system memory
#ifdef WIN32
printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);
*pp_a = (int *) VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
#else
printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);
*pp_a = (int *) mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
#endif
*ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);
printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated system memory\n", (float)nbytes/1048576.0f);
// pin allocate memory
checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
}
else
#endif
#endif
{
printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes/1048576.0f);
// allocate host memory (pinned memory is required to achieve asynchronicity)
checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
*ppAligned_a = *pp_a;
}
}
inline void
FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
// CUDA 4.0 and later support pinning of generic host memory
if (bPinGenericMemory)
{
// unpin and delete host memory
checkCudaErrors(cudaHostUnregister(*ppAligned_a));
#ifdef WIN32
VirtualFree(*pp_a, 0, MEM_RELEASE);
#else
munmap(*pp_a, nbytes);
#endif
}
else
#endif
#endif
{
cudaFreeHost(*pp_a);
}
}
static const char *sSyncMethod[] =
{
"0 (Automatic Blocking)",
"1 (Spin Blocking)",
"2 (Yield Blocking)",
"3 (Undefined Blocking Method)",
"4 (Blocking Sync Event) = low CPU utilization",
NULL
};
void printHelp()
{
printf("Usage: %s [options below]\n", sSDKsample);
printf("\t--sync_method=n for CPU/GPU synchronization\n");
printf("\t n=%s\n", sSyncMethod[0]);
printf("\t n=%s\n", sSyncMethod[1]);
printf("\t n=%s\n", sSyncMethod[2]);
printf("\t <Default> n=%s\n", sSyncMethod[4]);
printf("\t--use_generic_memory (default) use generic page-aligned for system memory\n");
printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate system memory\n");
}
#if defined(__APPLE__) || defined(MACOSX)
#define DEFAULT_PINNED_GENERIC_MEMORY false
#else
#define DEFAULT_PINNED_GENERIC_MEMORY true
#endif
int main(int argc, char **argv)
{
int cuda_device = 0;
int nstreams = 4; // number of streams for CUDA calls
int nreps = 10; // number of times each experiment is repeated
int n = 16 * 1024 * 1024; // number of ints in the data set
int nbytes = n * sizeof(int); // number of data bytes
dim3 threads, blocks; // kernel launch configuration
float elapsed_time, time_memcpy, time_kernel; // timing variables
float scale_factor = 1.0f;
// allocate generic memory and pin it later instead of using cudaHostAlloc()
bool bPinGenericMemory = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
int device_sync_method = cudaDeviceBlockingSync; // by default we use BlockingSync
int niterations; // number of iterations for the loop inside the kernel
printf("[ %s ]\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "help"))
{
printHelp();
return EXIT_SUCCESS;
}
if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0)
{
if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4)
{
printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
printf("Setting reps to 100 to demonstrate steady state\n");
nreps = 100;
}
else
{
printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
return EXIT_FAILURE;
}
}
else
{
printHelp();
return EXIT_SUCCESS;
}
if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory"))
{
#if defined(__APPLE__) || defined(MACOSX)
bPinGenericMemory = false; // Generic Pinning of System Paged memory not currently supported on Mac OSX
#else
bPinGenericMemory = true;
#endif
}
if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host"))
{
bPinGenericMemory = false;
}
printf("\n> ");
cuda_device = findCudaDevice(argc, (const char **)argv);
// check the compute capability of the device
int num_devices=0;
checkCudaErrors(cudaGetDeviceCount(&num_devices));
if (0==num_devices)
{
printf("your system does not have a CUDA capable device, waiving test...\n");
return EXIT_WAIVED;
}
// check if the command-line chosen device ID is within range, exit if not
if (cuda_device >= num_devices)
{
printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices-1);
return EXIT_FAILURE;
}
checkCudaErrors(cudaSetDevice(cuda_device));
// Checking for compute capabilities
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
niterations = 5;
// Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
if (bPinGenericMemory)
{
printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");
if (deviceProp.canMapHostMemory == 0)
{
printf("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
bPinGenericMemory = false;
}
}
// Any device with fewer than 32 cores will have a scaled-down workload
scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
n = (int)rint((float)n / scale_factor);
printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
printf("> scale_factor = %1.4f\n", 1.0f/scale_factor);
printf("> array_size = %d\n\n", n);
// enable use of blocking sync, to reduce CPU usage
printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));
// allocate host memory
int c = 5; // value to which the array will be initialized
int *h_a = 0; // pointer to the array data in host memory
int *hAligned_a = 0; // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)
// Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if using the new CUDA 4.0 features)
AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);
// allocate device memory
int *d_a = 0, *d_c = 0; // pointers to data and init value in the device memory
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
checkCudaErrors(cudaMemset(d_a, 0x0, nbytes));
checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int)));
checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));
printf("\nStarting Test\n");
// allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// create CUDA event handles
// use blocking sync
cudaEvent_t start_event, stop_event;
int eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync: cudaEventDefault);
checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
// time memcopy from device
checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to ensure that all previous CUDA calls have completed
checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
printf("memcopy:\t%.2f\n", time_memcpy);
// time kernel
threads=dim3(512, 1);
blocks=dim3(n / threads.x, 1);
checkCudaErrors(cudaEventRecord(start_event, 0));
init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
printf("kernel:\t\t%.2f\n", time_kernel);
//////////////////////////////////////////////////////////////////////
// time non-streamed execution for reference
threads=dim3(512, 1);
blocks=dim3(n / threads.x, 1);
checkCudaErrors(cudaEventRecord(start_event, 0));
for (int k = 0; k < nreps; k++)
{
init_array<<<blocks, threads>>>(d_a, d_c, niterations);
checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
}
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("non-streamed:\t%.2f\n", elapsed_time / nreps);
//////////////////////////////////////////////////////////////////////
// time execution with nstreams streams
threads=dim3(512,1);
blocks=dim3(n/(nstreams*threads.x),1);
memset(hAligned_a, 255, nbytes); // set host memory bits to all 1s, for testing correctness
checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
checkCudaErrors(cudaEventRecord(start_event, 0));
for (int k = 0; k < nreps; k++)
{
// asynchronously launch nstreams kernels, each operating on its own portion of data
for (int i = 0; i < nstreams; i++)
{
init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i *n / nstreams, d_c, niterations);
}
// asynchronously launch nstreams memcopies. Note that memcopy in stream x will only
// commence executing when all previous CUDA calls in stream x have completed
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams, d_a + i * n / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]));
}
}
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);
// check whether the output is correct
printf("-------------------------------\n");
bool bResults = correct_data(hAligned_a, n, c*nreps*niterations);
// release resources
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaStreamDestroy(streams[i]));
}
checkCudaErrors(cudaEventDestroy(start_event));
checkCudaErrors(cudaEventDestroy(stop_event));
// Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);
checkCudaErrors(cudaFree(d_a));
checkCudaErrors(cudaFree(d_c));
return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two memcopies
* with kernel execution. This sample illustrates the usage of CUDA streams to
* achieve overlapping of kernel execution with copying data to and from the device.
*
* Additionally, this sample uses CUDA events to measure elapsed time for
* CUDA calls. Events are part of the CUDA API and provide a system-independent
* way to measure execution times on CUDA devices with approximately 0.5
* microsecond precision.
*
* Elapsed times are averaged over nreps repetitions (10 by default).
*
*/
const char *sSDKname = "simpleMultiCopy";
// includes, system
#include <stdio.h>
// include CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
// includes, kernels
// Declare the CUDA kernels here, along with the main() code needed to launch
// the compute workload on the system
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N)
{
for (int i=0; i<inner_reps; ++i)
{
g_out[idx] = g_in[idx] + 1;
}
}
}
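// Note: the inner_reps loop in incKernel above rewrites the same value; it
// appears to exist only to lengthen the kernel's run time so that overlap
// with the copies is visible in the measurements below.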
#define STREAM_COUNT 4
// Uncomment to simulate data source/sink IO times
//#define SIMULATE_IO
int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];
cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;
int N = 1 << 22;
int nreps = 10; // number of times each experiment is repeated
int inner_reps = 5;
int memsize;
dim3 block(512);
dim3 grid;
int thread_blocks;
float processWithStreams(int streams_used);
void init();
bool test();
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
int cuda_device = 0;
float scale_factor;
cudaDeviceProp deviceProp;
printf("[%s] - Starting...\n", sSDKname);
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");
if (cuda_device < 0)
{
printf("Invalid command line parameters\n");
exit(EXIT_FAILURE);
}
else
{
printf("cuda_device = %d\n", cuda_device);
cuda_device = gpuDeviceInit(cuda_device);
if (cuda_device < 0)
{
printf("No CUDA Capable devices found, exiting...\n");
exit(EXIT_SUCCESS);
}
}
}
else
{
// Otherwise pick the device with the highest Gflops/s
cuda_device = gpuGetMaxGflopsDeviceId();
checkCudaErrors(cudaSetDevice(cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
}
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
deviceProp.name, deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
// Any device with fewer than 32 cores will have a scaled-down workload
scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
N = (int)((float)N / scale_factor);
printf("> Device name: %s\n", deviceProp.name);
printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor,
deviceProp.multiProcessorCount);
printf("> scale_factor = %.2f\n", 1.0f/scale_factor);
printf("> array_size = %d\n\n", N);
memsize = N * sizeof(int);
thread_blocks = N / block.x;
grid.x = thread_blocks % 65535;
grid.y = (thread_blocks / 65535 + 1);
// Allocate resources
h_data_source = (int *) malloc(memsize);
h_data_sink = (int *) malloc(memsize);
for (int i =0; i<STREAM_COUNT; ++i)
{
checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize,
cudaHostAllocDefault));
checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize,
cudaHostAllocDefault));
checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
checkCudaErrors(cudaStreamCreate(&stream[i]));
checkCudaErrors(cudaEventCreate(&cycleDone[i]));
cudaEventRecord(cycleDone[i], stream[i]);
}
cudaEventCreate(&start);
cudaEventCreate(&stop);
init();
// Kernel warmup
incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
// Time copies and kernel
cudaEventRecord(start,0);
checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
cudaMemcpyHostToDevice,0));
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float memcpy_h2d_time;
cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
cudaEventRecord(start,0);
checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
cudaMemcpyDeviceToHost, 0));
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float memcpy_d2h_time;
cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
cudaEventRecord(start,0);
incKernel<<<grid, block,0,0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float kernel_time;
cudaEventElapsedTime(&kernel_time, start, stop);
printf("\n");
printf("Relevant properties of this CUDA device\n");
printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution (device property \"deviceOverlap\")\n", deviceProp.deviceOverlap ? "X" : " ");
//printf("(%s) Can execute several GPU kernels simultaneously (compute capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
" (Compute Capability >= 2.0 AND (Tesla product OR Quadro 4000/5000/6000/K5000)\n",
(deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1)
? "X" : " ");
printf("\n");
printf("Measured timings (throughput):\n");
printf(" Memcpy host to device\t: %f ms (%f GB/s)\n",
memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time);
printf(" Memcpy device to host\t: %f ms (%f GB/s)\n",
memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);
printf(" Kernel\t\t\t: %f ms (%f GB/s)\n",
kernel_time, (inner_reps *memsize * 2e-6)/ kernel_time);
printf("\n");
printf("Theoretical limits for speedup gained from overlapped data transfers:\n");
printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
memcpy_h2d_time + memcpy_d2h_time + kernel_time);
printf("Compute can overlap with one transfer: %f ms\n",
max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
printf("Compute can overlap with both data transfers: %f ms\n",
max(max(memcpy_h2d_time,memcpy_d2h_time), kernel_time));
// Process pipelined work
float serial_time = processWithStreams(1);
float overlap_time = processWithStreams(STREAM_COUNT);
printf("\nAverage measured timings over %d repetitions:\n", nreps);
printf(" Avg. time when execution fully serialized\t: %f ms\n",
serial_time / nreps);
printf(" Avg. time when overlapped using %d streams\t: %f ms\n",
STREAM_COUNT, overlap_time / nreps);
printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
(serial_time - overlap_time) / nreps);
printf("\nMeasured throughput:\n");
printf(" Fully serialized execution\t\t: %f GB/s\n",
(nreps * (memsize * 2e-6))/ serial_time);
printf(" Overlapped using %d streams\t\t: %f GB/s\n",
STREAM_COUNT, (nreps * (memsize * 2e-6))/ overlap_time);
// Verify the results, we will use the results for final output
bool bResults = test();
// Free resources
free(h_data_source);
free(h_data_sink);
for (int i =0; i<STREAM_COUNT; ++i)
{
cudaFreeHost(h_data_in[i]);
cudaFree(d_data_in[i]);
cudaFreeHost(h_data_out[i]);
cudaFree(d_data_out[i]);
cudaStreamDestroy(stream[i]);
cudaEventDestroy(cycleDone[i]);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Test result
exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
float processWithStreams(int streams_used)
{
int current_stream = 0;
float time;
// Do processing in a loop
//
// Note: All memory commands are processed in the order they are issued,
// independent of the stream they are enqueued in. Hence the pattern by
// which the copy and kernel commands are enqueued in the stream
// has an influence on the achieved overlap.
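// In the loop below, each cycle issues: the kernel on the current stream,
// the upload (H2D copy) for the next stream, and then the download (D2H copy)
// of the current stream, so the next upload is already queued while the
// current kernel is still running.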
cudaEventRecord(start, 0);
for (int i=0; i<nreps; ++i)
{
int next_stream = (current_stream + 1) % streams_used;
#ifdef SIMULATE_IO
// Store the result
memcpy(h_data_sink, h_data_out[current_stream],memsize);
// Read new input
memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// Process current frame
incKernel<<<grid, block, 0, stream[current_stream]>>>(
d_data_out[current_stream],
d_data_in[current_stream],
N,
inner_reps);
// Upload next frame
checkCudaErrors(cudaMemcpyAsync(
d_data_in[next_stream],
h_data_in[next_stream],
memsize,
cudaMemcpyHostToDevice,
stream[next_stream]));
// Download current frame
checkCudaErrors(cudaMemcpyAsync(
h_data_out[current_stream],
d_data_out[current_stream],
memsize,
cudaMemcpyDeviceToHost,
stream[current_stream]));
checkCudaErrors(cudaEventRecord(
cycleDone[current_stream],
stream[current_stream]));
current_stream = next_stream;
}
cudaEventRecord(stop, 0);
cudaDeviceSynchronize();
cudaEventElapsedTime(&time, start, stop);
return time;
}
void init()
{
for (int i=0; i<N; ++i)
{
h_data_source[i] = 0;
}
for (int i =0; i<STREAM_COUNT; ++i)
{
memcpy(h_data_in[i], h_data_source, memsize);
}
}
bool test()
{
bool passed = true;
for (int j =0; j<STREAM_COUNT; ++j)
{
for (int i =0; i<N; ++i)
{
passed &= (h_data_out[j][i] == 1);
}
}
return passed;
}
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////
/* Template project which demonstrates the basics on how to setup a project
* example application.
* Host code.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
extern "C"
void computeGold(float *reference, float *idata, const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_idata input data in global memory
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
testKernel(float *g_idata, float *g_odata)
{
// shared memory
// the size is determined by the host application
extern __shared__ float sdata[];
// access thread id
const unsigned int tid = threadIdx.x;
// access number of threads in this block
const unsigned int num_threads = blockDim.x;
// read in input data from global memory
sdata[tid] = g_idata[tid];
__syncthreads();
// perform some computations
sdata[tid] = (float) num_threads * sdata[tid];
__syncthreads();
// write data to global memory
g_odata[tid] = sdata[tid];
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char **argv)
{
bool bTestResult = true;
printf("%s Starting...\n\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);
StopWatchInterface *timer = 0;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
unsigned int num_threads = 32;
unsigned int mem_size = sizeof(float) * num_threads;
// allocate host memory
float *h_idata = (float *) malloc(mem_size);
// initialize the memory
for (unsigned int i = 0; i < num_threads; ++i)
{
h_idata[i] = (float) i;
}
// allocate device memory
float *d_idata;
checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size,
cudaMemcpyHostToDevice));
// allocate device memory for result
float *d_odata;
checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));
// setup execution parameters
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);
// execute the kernel
testKernel<<< grid, threads, mem_size >>>(d_idata, d_odata);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
// allocate mem for the result on host side
float *h_odata = (float *) malloc(mem_size);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads,
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer);
// compute reference solution
float *reference = (float *) malloc(mem_size);
computeGold(reference, h_idata, num_threads);
// check result
if (checkCmdLineFlag(argc, (const char **) argv, "regression"))
{
// write file for regression test
sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
}
else
{
// custom output handling when no regression test running
// in this case check if the result is equivalent to the expected solution
bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
}
// cleanup memory
free(h_idata);
free(h_odata);
free(reference);
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata));
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Simple example demonstrating how to use MPI with CUDA
*
* Generate some random numbers on one node.
* Dispatch them to all nodes.
* Compute their square root on each node's GPU.
* Compute the average of the results using MPI.
*
* simpleMPI.cu: GPU part, compiled with nvcc
*/
#include <iostream>
using std::cerr;
using std::endl;
#include "simpleMPI.h"
// Error handling macro
#define CUDA_CHECK(call) \
if((call) != cudaSuccess) { \
cudaError_t err = cudaGetLastError(); \
cerr << "CUDA error calling \""#call"\", code is " << err << endl; \
my_abort(err); }
// Device code
// Very simple GPU Kernel that computes square roots of input numbers
__global__ void simpleMPIKernel(float *input, float *output)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
output[tid] = sqrt(input[tid]);
}
// Initialize an array with random data (between 0 and 1)
void initData(float *data, int dataSize)
{
for (int i = 0; i < dataSize; i++)
{
data[i] = (float)rand() / RAND_MAX;
}
}
// CUDA computation on each node
// No MPI here, only CUDA
void computeGPU(float *hostData, int blockSize, int gridSize)
{
int dataSize = blockSize * gridSize;
// Allocate data on GPU memory
float *deviceInputData = NULL;
CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));
float *deviceOutputData = NULL;
CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));
// Copy to GPU memory
CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));
// Run kernel
simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
// Copy data back to CPU memory
CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize *sizeof(float), cudaMemcpyDeviceToHost));
// Free GPU memory
CUDA_CHECK(cudaFree(deviceInputData));
CUDA_CHECK(cudaFree(deviceOutputData));
}
float sum(float *data, int size)
{
float accum = 0.f;
for (int i = 0; i < size; i++)
{
accum += data[i];
}
return accum;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef _SIMPLETEXTURE_KERNEL_H_
#define _SIMPLETEXTURE_KERNEL_H_
// declare texture reference for 2D float texture
texture<float, 2, cudaReadModeElementType> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C"
__global__ void
transformKernel(float *g_odata, int width, int height, float theta)
{
// calculate normalized texture coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
float u = (float)x - (float)width/2;
float v = (float)y - (float)height/2;
float tu = u*cosf(theta) - v*sinf(theta);
float tv = v*cosf(theta) + u*sinf(theta);
tu /= (float)width;
tv /= (float)height;
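// The two rotations above map (u, v), the pixel offset from the image centre,
// through a rotation by theta; the divisions above and the +0.5f offsets in
// the tex2D() call below convert the result back into normalized texture
// coordinates, so the output is the input image rotated about its centre.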
// read from texture and write to global memory
g_odata[y*width + x] = tex2D(tex, tu+0.5f, tv+0.5f);
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
__device__ float multiplyByTwo(float number)
{
return number * 2.0f;
}
__device__ float divideByTwo(float number)
{
return number * 0.5f;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SIMPLE_DEVICE_LIBRARY_CUH
#define SIMPLE_DEVICE_LIBRARY_CUH
extern __device__ float multiplyByTwo(float number);
extern __device__ float divideByTwo(float number);
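// Build sketch (the exact flags are an assumption; the sample's own build
// files are authoritative): calling these functions from a different
// translation unit requires relocatable device code, e.g.
//     nvcc -dc simpleDeviceLibrary.cu simpleSeparateCompilation.cu
//     nvcc -o simpleSeparateCompilation simpleDeviceLibrary.o simpleSeparateCompilation.o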
#endif /* SIMPLE_DEVICE_LIBRARY_CUH */
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// System includes.
#include <stdio.h>
#include <iostream>
// STL.
#include <vector>
// CUDA runtime.
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA.
#include <helper_functions.h>
#include <helper_cuda.h>
// Device library includes.
#include "simpleDeviceLibrary.cuh"
using std::cout;
using std::endl;
using std::vector;
#define EPS 1e-5
typedef unsigned int uint;
typedef float(*deviceFunc)(float);
const char *sampleName = "simpleSeparateCompilation";
////////////////////////////////////////////////////////////////////////////////
// Auto-Verification Code
bool testResult = true;
////////////////////////////////////////////////////////////////////////////////
// Static device pointers to __device__ functions.
__device__ deviceFunc dMultiplyByTwoPtr = multiplyByTwo;
__device__ deviceFunc dDivideByTwoPtr = divideByTwo;
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Transforms vector.
//! Applies the __device__ function "f" to each element of the vector "v".
////////////////////////////////////////////////////////////////////////////////
__global__ void transformVector(float *v, deviceFunc f, uint size)
{
uint tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < size)
{
v[tid] = (*f)(v[tid]);
}
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, const char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
cout << sampleName << " starting..." << endl;
runTest(argc, (const char **)argv);
cout << sampleName << " completed, returned "
<< (testResult ? "OK" : "ERROR") << endl;
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void runTest(int argc, const char **argv)
{
try
{
int devID;
//cudaError_t error;
// This will pick the best possible CUDA capable device.
devID = findCudaDevice(argc, (const char **) argv);
// Create host vector.
const uint kVectorSize = 1000;
vector<float> hVector(kVectorSize);
for (uint i = 0; i < kVectorSize; ++i)
{
hVector[i] = rand() / static_cast<float>(RAND_MAX);
}
// Create and populate device vector.
float *dVector;
checkCudaErrors(cudaMalloc(&dVector, kVectorSize * sizeof(float)));
checkCudaErrors(cudaMemcpy(dVector,
&hVector[0],
kVectorSize * sizeof(float),
cudaMemcpyHostToDevice));
// Kernel configuration, where a one-dimensional
// grid and one-dimensional blocks are configured.
const int nThreads = 1024;
const int nBlocks = 1;
dim3 dimGrid(nBlocks);
dim3 dimBlock(nThreads);
// Test library functions.
deviceFunc hFunctionPtr;
cudaMemcpyFromSymbol(&hFunctionPtr,
dMultiplyByTwoPtr,
sizeof(deviceFunc));
transformVector<<<dimGrid, dimBlock>>>
(dVector, hFunctionPtr, kVectorSize);
checkCudaErrors(cudaGetLastError());
cudaMemcpyFromSymbol(&hFunctionPtr,
dDivideByTwoPtr,
sizeof(deviceFunc));
transformVector<<<dimGrid, dimBlock>>>
(dVector, hFunctionPtr, kVectorSize);
checkCudaErrors(cudaGetLastError());
// Download results.
vector<float> hResultVector(kVectorSize);
checkCudaErrors(cudaMemcpy(&hResultVector[0],
dVector,
kVectorSize *sizeof(float),
cudaMemcpyDeviceToHost));
// Check results.
for (uint i = 0; i < kVectorSize; ++i)
{
if (fabs(hVector[i] - hResultVector[i]) > EPS)
{
cout << "Computations were incorrect..." << endl;
testResult = false;
break;
}
}
// Free resources.
if (dVector) checkCudaErrors(cudaFree(dVector));
}
catch (...)
{
cout << "Error occured, exiting..." << endl;
exit(EXIT_FAILURE);
}
}
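/*
* Build note (a sketch, not taken from this sample's makefile): because
* multiplyByTwo and divideByTwo live in a separate translation unit, both
* files must be compiled with relocatable device code and device-linked,
* for example:
*
* nvcc -rdc=true -c simpleDeviceLibrary.cu simpleSeparateCompilation.cu
* nvcc -rdc=true simpleDeviceLibrary.o simpleSeparateCompilation.o -o simpleSeparateCompilation
*
* The .cu/.o file names are assumptions that match the earlier #include of
* "simpleDeviceLibrary.cuh".
*/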
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////
//
// This sample illustrates the usage of CUDA events for both GPU timing and
// overlapping CPU and GPU execution. Events are inserted into a stream
// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
// perform computations while GPU is executing (including DMA memcopies
// between the host and device). CPU can query CUDA events to determine
// whether GPU has completed tasks.
//
// includes, system
#include <stdio.h>
// includes CUDA Runtime
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions
__global__ void increment_kernel(int *g_data, int inc_value)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_data[idx] = g_data[idx] + inc_value;
}
bool correct_output(int *data, const int n, const int x)
{
for (int i = 0; i < n; i++)
if (data[i] != x)
{
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
return false;
}
return true;
}
int main(int argc, char *argv[])
{
int devID;
cudaDeviceProp deviceProps;
printf("[%s] - Starting...\n", argv[0]);
// This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv);
// get device name
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s]\n", deviceProps.name);
int n = 16 * 1024 * 1024;
int nbytes = n * sizeof(int);
int value = 26;
// allocate host memory
int *a = 0;
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
memset(a, 0, nbytes);
// allocate device memory
int *d_a=0;
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
checkCudaErrors(cudaMemset(d_a, 255, nbytes));
// set kernel launch configuration
dim3 threads = dim3(512, 1);
dim3 blocks = dim3(n / threads.x, 1);
// create cuda event handles
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
checkCudaErrors(cudaDeviceSynchronize());
float gpu_time = 0.0f;
// asynchronously issue work to the GPU (all to stream 0)
sdkStartTimer(&timer);
cudaEventRecord(start, 0);
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
cudaEventRecord(stop, 0);
sdkStopTimer(&timer);
// have CPU do some work while waiting for stage 1 to finish
unsigned long int counter=0;
while (cudaEventQuery(stop) == cudaErrorNotReady)
{
counter++;
}
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// print the cpu and gpu times
printf("time spent executing by the GPU: %.2f\n", gpu_time);
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
// check the output for correctness
bool bFinalResults = correct_output(a, n, value);
// release resources
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFree(d_a));
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
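/*
* Note (not part of the original sample): if no CPU-side work needs to
* overlap with the GPU, the busy-wait loop on cudaEventQuery() above could
* be replaced by a blocking wait on the same event:
*
* checkCudaErrors(cudaEventSynchronize(stop));
* checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
*
* The sample polls deliberately to demonstrate CPU/GPU overlap.
*/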
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <helper_cuda.h>
#include <helper_string.h>
////////////////////////////////////////////////////////////////////////////////
// Variable on the GPU used to generate unique identifiers of blocks.
////////////////////////////////////////////////////////////////////////////////
__device__ int g_uids = 0;
////////////////////////////////////////////////////////////////////////////////
// Print a simple message to signal the block which is currently executing.
////////////////////////////////////////////////////////////////////////////////
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
if (threadIdx.x == 0)
{
if (depth == 0)
printf("BLOCK %d launched by the host\n", uid);
else
{
char buffer[32];
for (int i = 0 ; i < depth ; ++i)
{
buffer[3*i+0] = '|';
buffer[3*i+1] = ' ';
buffer[3*i+2] = ' ';
}
buffer[3*depth] = '\0';
printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
}
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// The kernel using CUDA dynamic parallelism.
//
// It generates a unique identifier for each block. Prints the information
// about that block. Finally, if the 'max_depth' has not been reached, the
// block launches new blocks directly from the GPU.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
// We create a unique ID per block. Thread 0 does that and shares the value with the other threads.
__shared__ int s_uid;
if (threadIdx.x == 0)
{
s_uid = atomicAdd(&g_uids, 1);
}
__syncthreads();
// We print the ID of the block and information about its parent.
print_info(depth, thread, s_uid, parent_uid);
// We launch new blocks if we haven't reached the max_depth yet.
if (++depth >= max_depth)
{
return;
}
cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_uid);
}
////////////////////////////////////////////////////////////////////////////////
// Main entry point.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("starting Simple Print (CUDA Dynamic Parallelism)\n");
// Parse a few command-line arguments.
int max_depth = 2;
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
checkCmdLineFlag(argc, (const char **)argv, "h"))
{
printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
exit(EXIT_SUCCESS);
}
if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
{
max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");
if (max_depth < 1 || max_depth > 8)
{
printf("depth parameter has to be between 1 and 8\n");
exit(EXIT_FAILURE);
}
}
// Find/set the device.
int device = -1;
cudaDeviceProp deviceProp;
device = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));
if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
{
printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name);
exit(EXIT_WAIVED);
}
// Print a message describing what the sample does.
printf("***************************************************************************\n");
printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
printf("launch 2 blocks of 2 threads each. The GPU we will do that recursively\n");
printf("until it reaches max_depth=%d\n\n", max_depth);
printf("In total 2");
int num_blocks = 2, sum = 2;
for (int i = 1 ; i < max_depth ; ++i)
{
num_blocks *= 4;
printf("+%d", num_blocks);
sum += num_blocks;
}
printf("=%d blocks are launched!!! (%d from the GPU)\n", sum, sum-2);
printf("***************************************************************************\n\n");
// We set the recursion limit for CDP to max_depth.
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
// Launch the kernel from the CPU.
printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
checkCudaErrors(cudaGetLastError());
// Finalize.
checkCudaErrors(cudaDeviceSynchronize());
exit(EXIT_SUCCESS);
}
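/*
* Worked example of the block count printed above (not part of the sample):
* for max_depth=3 the host launches 2 blocks of 2 threads; those 4 threads
* each launch 2 blocks of 2 threads (8 blocks at depth 1); the resulting
* 16 threads launch 32 blocks at depth 2. Total: 2 + 8 + 32 = 42 blocks,
* 40 of them launched from the GPU, which matches the loop that multiplies
* num_blocks by 4 per level.
*/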
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample implements multi-threaded heterogeneous computing workloads with the new CPU callbacks for CUDA streams and events introduced with CUDA 5.0.
* Together with the thread safety of the CUDA API, implementing heterogeneous workloads that move between CPU threads and GPUs has become simple and efficient.
*
* The workloads in the sample follow the form CPU preprocess -> GPU process -> CPU postprocess.
* Each CPU processing step is handled by its own dedicated thread. GPU workloads are sent to all available GPUs in the system.
*
*/
// System includes
#include <stdio.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include "multithreading.h"
const int N_workloads = 8;
const int N_elements_per_workload = 100000;
CUTBarrier thread_barrier;
void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data);
struct heterogeneous_workload
{
int id;
int cudaDeviceID;
int *h_data;
int *d_data;
cudaStream_t stream;
bool success;
};
__global__
void incKernel(int *data, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
data[i]++;
}
CUT_THREADPROC launch(void *void_arg)
{
heterogeneous_workload *workload = (heterogeneous_workload *) void_arg;
// Select GPU for this CPU thread
checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));
// Allocate Resources
checkCudaErrors(cudaStreamCreate(&workload->stream));
checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));
// CPU thread generates data
for (int i=0; i < N_elements_per_workload; ++i)
{
workload->h_data[i] = workload->id + i;
}
// Schedule work for GPU in CUDA stream without blocking the CPU thread
// Note: Dedicated streams enable concurrent execution of workloads on the GPU
dim3 block(512);
dim3 grid((N_elements_per_workload + block.x-1) / block.x);
checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data, N_elements_per_workload * sizeof(int), cudaMemcpyHostToDevice, workload->stream));
incKernel<<<grid, block,0,workload->stream>>>(workload->d_data, N_elements_per_workload);
checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data, N_elements_per_workload * sizeof(int), cudaMemcpyDeviceToHost, workload->stream));
// New in CUDA 5.0: Add a CPU callback which is called once all currently pending operations in the CUDA stream have finished
checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
CUT_THREADEND;
// CPU thread end of life, GPU continues to process data...
}
CUT_THREADPROC postprocess(void *void_arg)
{
heterogeneous_workload *workload = (heterogeneous_workload *) void_arg;
// ... GPU is done with processing, continue on new CPU thread...
// Select GPU for this CPU thread
checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));
// CPU thread consumes results from GPU
workload->success = true;
for (int i = 0; i < N_elements_per_workload; ++i)
{
workload->success &= workload->h_data[i] == i + workload->id + 1;
}
// Free Resources
checkCudaErrors(cudaFree(workload->d_data));
checkCudaErrors(cudaFreeHost(workload->h_data));
checkCudaErrors(cudaStreamDestroy(workload->stream));
// Signal the end of the heterogeneous workload to main thread
cutIncrementBarrier(&thread_barrier);
CUT_THREADEND;
}
void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
{
// Check status of GPU after stream operations are done
checkCudaErrors(status);
// Spawn new CPU worker thread and continue processing on the CPU
cutStartThread(postprocess, data);
}
int main(int argc, char **argv)
{
int N_gpus, max_gpus = 0;
int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration
printf("Starting simpleCallback\n");
checkCudaErrors(cudaGetDeviceCount(&N_gpus));
printf("Found %d CUDA capable GPUs\n", N_gpus);
if (N_gpus > 32)
{
printf("simpleCallback only supports up to 32 GPU(s), using the first 32\n");
N_gpus = 32;
}
for (int devid=0; devid < N_gpus; devid++)
{
int SMversion;
cudaDeviceProp deviceProp;
cudaSetDevice(devid);
cudaGetDeviceProperties(&deviceProp, devid);
SMversion = (deviceProp.major << 4) + deviceProp.minor;
printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");
if (SMversion >= 0x11)
{
gpuInfo[max_gpus++] = devid;
}
}
printf("%d GPUs available to run Callback Functions\n", max_gpus);
heterogeneous_workload *workloads;
workloads = (heterogeneous_workload *) malloc(N_workloads * sizeof(heterogeneous_workload));
thread_barrier = cutCreateBarrier(N_workloads);
// Main thread spawns a CPU worker thread for each heterogeneous workload
printf("Starting %d heterogeneous computing workloads\n", N_workloads);
for (int i=0; i< N_workloads; ++i)
{
workloads[i].id = i;
workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus;
cutStartThread(launch, &workloads[i]);
}
// Sleep until all workloads have finished
cutWaitForBarrier(&thread_barrier);
printf("Total of %d workloads finished:\n", N_workloads);
bool success = true;
for (int i=0; i< N_workloads; ++i)
{
success &= workloads[i].success;
}
printf("%s\n", success ? "Success" : "Failure");
free(workloads);
exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
}
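/*
* Note (not part of the original sample): callbacks registered with
* cudaStreamAddCallback() must not call any CUDA API functions, which is why
* myStreamCallback() only spawns a CPU thread for the postprocessing step.
* CUDA 10.0 also offers cudaLaunchHostFunc() as an alternative; a sketch,
* with myHostFn being a hypothetical host function of type cudaHostFn_t:
*
* void CUDART_CB myHostFn(void *userData);
* checkCudaErrors(cudaLaunchHostFunc(stream, myHostFn, userData));
*
* The same restriction applies: the host function may not call into CUDA.
*/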
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// This example shows how to use the clock function to measure the performance of
// block of threads of a kernel accurately.
//
// Blocks are executed in parallel and out of order. Since there's no synchronization
// mechanism between blocks, we measure the clock once for each block. The clock
// samples are written to device memory.
// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.
extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
{
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock();
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2)
{
__syncthreads();
if (tid < d)
{
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0)
{
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0) output[bid] = shared[0];
__syncthreads();
if (tid == 0) timer[bid+gridDim.x] = clock();
}
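/*
* Host-side sketch (not part of this kernel file): timer[] holds one start
* clock per block followed by one stop clock per block, so the per-block
* cycle count can be recovered with something like
*
* clock_t elapsed = timer[blockId + numBlocks] - timer[blockId];
*
* where numBlocks is the gridDim.x used for the launch (names illustrative).
*/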
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////
/* Example of integrating CUDA functions into an existing
* application / framework.
* Host part of the device code.
* Compiled with Cuda compiler.
*/
// System includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);
///////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(int *g_data)
{
// write data to global memory
const unsigned int tid = threadIdx.x;
int data = g_data[tid];
// use integer arithmetic to process all four bytes with one thread
// this serializes the execution, but is the simplest solution to avoid
// bank conflicts for this very low number of threads
// in general it is more efficient to process each byte by a separate thread,
// to avoid bank conflicts the access pattern should be
// g_data[4 * wtid + wid], where wtid is the thread id within the half warp
// and wid is the warp id
// see also the programming guide for a more in depth discussion.
g_data[tid] = ((((data << 0) >> 24) - 10) << 24)
| ((((data << 8) >> 24) - 10) << 16)
| ((((data << 16) >> 24) - 10) << 8)
| ((((data << 24) >> 24) - 10) << 0);
}
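/*
* Sketch (not part of the original sample) of the straightforward
* one-byte-per-thread variant mentioned in the comment above: launch four
* times as many threads and let each thread transform a single char. The
* kernel name is illustrative.
*/
__global__ void kernelBytePerThread(char *g_data)
{
// one thread per byte; applies the same -10 transform as the packed version
const unsigned int tid = threadIdx.x;
g_data[tid] = g_data[tid] - 10;
}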
///////////////////////////////////////////////////////////////////////////////
//! Demonstration that int2 data can be used in the cpp code
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void
kernel2(int2 *g_data)
{
// write data to global memory
const unsigned int tid = threadIdx.x;
int2 data = g_data[tid];
// use integer arithmetic to process all four bytes with one thread
// this serializes the execution, but is the simplest solution to avoid
// bank conflicts for this very low number of threads
// in general it is more efficient to process each byte by a separate thread,
// to avoid bank conflicts the access pattern should be
// g_data[4 * wtid + wid], where wtid is the thread id within the half warp
// and wid is the warp id
// see also the programming guide for a more in depth discussion.
g_data[tid].x = data.x - data.y;
}
////////////////////////////////////////////////////////////////////////////////
//! Entry point for Cuda functionality on host side
//! @param argc command line argument count
//! @param argv command line arguments
//! @param data data to process on the device
//! @param len len of \a data
////////////////////////////////////////////////////////////////////////////////
extern "C" bool
runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len)
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
findCudaDevice(argc, (const char **)argv);
const unsigned int num_threads = len / 4;
assert(0 == (len % 4));
const unsigned int mem_size = sizeof(char) * len;
const unsigned int mem_size_int2 = sizeof(int2) * len;
// allocate device memory
char *d_data;
checkCudaErrors(cudaMalloc((void **) &d_data, mem_size));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_data, data, mem_size,
cudaMemcpyHostToDevice));
// allocate device memory for int2 version
int2 *d_data_int2;
checkCudaErrors(cudaMalloc((void **) &d_data_int2, mem_size_int2));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
cudaMemcpyHostToDevice));
// setup execution parameters
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);
dim3 threads2(len, 1, 1); // more threads needed for separate int2 version
// execute the kernel
kernel<<< grid, threads >>>((int *) d_data);
kernel2<<< grid, threads2 >>>(d_data_int2);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
// compute reference solutions
char *reference = (char *) malloc(mem_size);
computeGold(reference, data, len);
int2 *reference2 = (int2 *) malloc(mem_size_int2);
computeGold2(reference2, data_int2, len);
// copy results from device to host
checkCudaErrors(cudaMemcpy(data, d_data, mem_size,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
cudaMemcpyDeviceToHost));
// check result
bool success = true;
for (unsigned int i = 0; i < len; i++)
{
if (reference[i] != data[i] ||
reference2[i].x != data_int2[i].x ||
reference2[i].y != data_int2[i].y)
{
success = false;
}
}
// cleanup memory
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_data_int2));
free(reference);
free(reference2);
return success;
}
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This application demonstrates how to use the CUDA API to use multiple GPUs,
* with an emphasis on simple illustration of the techniques (not on performance).
*
* Note that in order to detect multiple GPUs in your system you have to disable
* SLI in the nvidia control panel. Otherwise only one GPU is visible to the
* application. On the other side, you can still extend your desktop to screens
* attached to both GPUs.
*/
// System includes
#include <stdio.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <timer.h>
#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif
#include "simpleMultiGPU.h"
////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 1048576 * 32;
////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int threadN = gridDim.x * blockDim.x;
float sum = 0;
for (int pos = tid; pos < N; pos += threadN)
sum += d_Input[pos];
d_Result[tid] = sum;
}
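/*
* Note (not part of the original sample): reduceKernel uses a grid-stride
* loop, so the fixed BLOCK_N x THREAD_N launch covers any dataN; each of the
* ACCUM_N threads accumulates a strided slice of the input and writes one
* partial sum, and the host finishes the reduction over those ACCUM_N values
* after the asynchronous copy back.
*/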
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
//Solver config
TGPUplan plan[MAX_GPU_COUNT];
//GPU reduction results
float h_SumGPU[MAX_GPU_COUNT];
float sumGPU;
double sumCPU, diff;
int i, j, gpuBase, GPU_N;
const int BLOCK_N = 32;
const int THREAD_N = 256;
const int ACCUM_N = BLOCK_N * THREAD_N;
printf("Starting simpleMultiGPU\n");
checkCudaErrors(cudaGetDeviceCount(&GPU_N));
if (GPU_N > MAX_GPU_COUNT)
{
GPU_N = MAX_GPU_COUNT;
}
printf("CUDA-capable device count: %i\n", GPU_N);
printf("Generating input data...\n\n");
//Subdividing input data across GPUs
//Get data sizes for each GPU
for (i = 0; i < GPU_N; i++)
{
plan[i].dataN = DATA_N / GPU_N;
}
//Take into account "odd" data sizes
for (i = 0; i < DATA_N % GPU_N; i++)
{
plan[i].dataN++;
}
//Assign data ranges to GPUs
gpuBase = 0;
for (i = 0; i < GPU_N; i++)
{
plan[i].h_Sum = h_SumGPU + i;
gpuBase += plan[i].dataN;
}
//Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked)
for (i = 0; i < GPU_N; i++)
{
checkCudaErrors(cudaSetDevice(i));
checkCudaErrors(cudaStreamCreate(&plan[i].stream));
//Allocate memory
checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));
for (j = 0; j < plan[i].dataN; j++)
{
plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
}
}
//Start timing and compute on GPU(s)
printf("Computing with %d GPUs...\n", GPU_N);
StartTimer();
//Copy data to GPU, launch the kernel and copy data back. All asynchronously
for (i = 0; i < GPU_N; i++)
{
//Set device
checkCudaErrors(cudaSetDevice(i));
//Copy input data from CPU
checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));
//Perform GPU computations
reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
getLastCudaError("reduceKernel() execution failed.\n");
//Read back GPU results
checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N *sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
}
//Process GPU results
for (i = 0; i < GPU_N; i++)
{
float sum;
//Set device
checkCudaErrors(cudaSetDevice(i));
//Wait for all operations to finish
cudaStreamSynchronize(plan[i].stream);
//Finalize GPU reduction for current subvector
sum = 0;
for (j = 0; j < ACCUM_N; j++)
{
sum += plan[i].h_Sum_from_device[j];
}
*(plan[i].h_Sum) = (float)sum;
//Shut down this GPU
checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
checkCudaErrors(cudaFree(plan[i].d_Sum));
checkCudaErrors(cudaFree(plan[i].d_Data));
checkCudaErrors(cudaStreamDestroy(plan[i].stream));
}
sumGPU = 0;
for (i = 0; i < GPU_N; i++)
{
sumGPU += h_SumGPU[i];
}
printf(" GPU Processing time: %f (ms)\n\n", GetTimer());
// Compute on Host CPU
printf("Computing with Host CPU...\n\n");
sumCPU = 0;
for (i = 0; i < GPU_N; i++)
{
for (j = 0; j < plan[i].dataN; j++)
{
sumCPU += plan[i].h_Data[j];
}
}
// Compare GPU and CPU results
printf("Comparing GPU and Host CPU results...\n");
diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
printf(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
printf(" Relative difference: %E \n\n", diff);
// Cleanup and shutdown
for (i = 0; i < GPU_N; i++)
{
checkCudaErrors(cudaSetDevice(i));
checkCudaErrors(cudaFreeHost(plan[i].h_Data));
}
exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}
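/*
* Note (not part of the original sample): h_Data and h_Sum_from_device are
* allocated with cudaMallocHost() because cudaMemcpyAsync() only overlaps
* copies with kernel execution when the host memory is page-locked; with
* pageable memory the transfers behave essentially synchronously.
*/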
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/**
*
* This sample is a simple code that illustrates basic usage of
* cooperative groups within the thread block. The code launches a single
* thread block, creates a cooperative group of all threads in the block,
* and a set of tiled partition cooperative groups. For each, it uses a
* generic reduction function to calculate the sum of all the ranks in
* that group. In each case the result is printed, together with the
* expected answer (which is calculated using the analytical formula
* ((n-1)*n)/2, noting that the ranks start at zero).
*
*/
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;
/**
* CUDA device function
*
* calculates the sum of val across the group g. The workspace array, x,
* must be large enough to contain g.size() integers.
*/
__device__ int sumReduction(thread_group g, int *x, int val)
{
// rank of this thread in the group
int lane = g.thread_rank();
// for each iteration of this loop, the number of threads active in the
// reduction, i, is halved, and each active thread (with index [lane])
// performs a single summation of it's own value with that
// of a "partner" (with index [lane+i]).
for (int i = g.size()/2; i > 0; i /= 2)
{
// store value for this thread in temporary array
x[lane] = val;
// synchronize all threads in group
g.sync();
if(lane<i)
// active threads perform summation of their value with
// their partner's value
val += x[lane + i];
// synchronize all threads in group
g.sync();
}
// master thread in group returns result, and others return -1.
if (g.thread_rank()==0)
return val;
else
return -1;
}
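/*
* Worked example (not part of the original sample): for a group of size 8
* holding its ranks 0..7, the loop runs with i = 4, 2, 1. After i = 4 the
* first four lanes hold 4, 6, 8, 10; after i = 2 lanes 0 and 1 hold 12 and
* 16; after i = 1 lane 0 holds 28 = (8-1)*8/2, which thread 0 returns.
*/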
/**
* CUDA kernel device code
*
* Creates cooperative groups and performs reductions
*/
__global__ void cgkernel(){
// threadBlockGroup includes all threads in the block
thread_block threadBlockGroup = this_thread_block();
int threadBlockGroupSize=threadBlockGroup.size();
// workspace array in shared memory required for reduction
extern __shared__ int workspace[];
int input, output, expectedOutput;
// input to reduction, for each thread, is its rank in the group
input=threadBlockGroup.thread_rank();
// expected output from analytical formula (n-1)(n)/2
// (noting that indexing starts at 0 rather than 1)
expectedOutput=(threadBlockGroupSize-1)*threadBlockGroupSize/2;
// perform reduction
output=sumReduction(threadBlockGroup, workspace, input);
// master thread in group prints out result
if(threadBlockGroup.thread_rank()==0){
printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
threadBlockGroup.size()-1,output,
expectedOutput);
printf(" Now creating %d groups, each of size 16 threads:\n\n",
threadBlockGroup.size()/16);
}
threadBlockGroup.sync();
// each tiledPartition16 group includes 16 threads
thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);
// This offset allows each group to have its own unique area in the workspace array
int workspaceOffset=threadBlockGroup.thread_rank()-tiledPartition16.thread_rank();
// input to reduction, for each thread, is its rank in the group
input=tiledPartition16.thread_rank();
// expected output from analytical formula (n-1)(n)/2
// (noting that indexing starts at 0 rather than 1)
expectedOutput=15*16/2;
// Perform reduction
output=sumReduction(tiledPartition16, workspace+workspaceOffset, input);
// each master thread prints out result
if(tiledPartition16.thread_rank()==0)
printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d (expected %d)\n",output,expectedOutput);
return;
}
/**
* Host main routine
*/
int main(){
// Error code to check return values for CUDA calls
cudaError_t err;
//Launch the kernel
int blocksPerGrid=1;
int threadsPerBlock=64;
printf("\nLaunching a single block with %d threads...\n\n",threadsPerBlock);
// we use the optional third argument to specify the size
// of shared memory required in the kernel
cgkernel <<<blocksPerGrid,threadsPerBlock,threadsPerBlock*sizeof(int)>>> ();
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("\n...Done.\n\n");
return 0;
}
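/*
* Sketch (not part of this sample): for tiles of warp size or smaller the
* shared-memory workspace can be avoided entirely by reducing in registers
* with the tile's shuffle primitives. The function name is illustrative.
*/
__device__ int warpSumReduction(cooperative_groups::thread_block_tile<32> tile, int val)
{
// each step folds the upper half of the contributing lanes onto the lower half
for (int offset = tile.size() / 2; offset > 0; offset /= 2)
{
val += tile.shfl_down(val, offset);
}
return val; // lane 0 ends up holding the sum over the whole tile
}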
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 3
* of the programming guide with some additions like error checking.
*
*/
// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
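/*
* Launch sketch (not part of this kernel file; the sample's matching host
* code is not shown here): with the runtime API the kernel could be launched
* as follows, assuming d_A, d_B and d_C are device allocations of N floats:
*
* int threadsPerBlock = 256;
* int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
* VecAdd_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
*/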
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <iostream>
#include <cstdio>
#include <helper_cuda.h>
#include <helper_string.h>
#define MAX_DEPTH 16
#define INSERTION_SORT 32
////////////////////////////////////////////////////////////////////////////////
// Selection sort used when depth gets too big or the number of elements drops
// below a threshold.
////////////////////////////////////////////////////////////////////////////////
__device__ void selection_sort(unsigned int *data, int left, int right)
{
for (int i = left ; i <= right ; ++i)
{
unsigned min_val = data[i];
int min_idx = i;
// Find the smallest value in the range [left, right].
for (int j = i+1 ; j <= right ; ++j)
{
unsigned val_j = data[j];
if (val_j < min_val)
{
min_idx = j;
min_val = val_j;
}
}
// Swap the values.
if (i != min_idx)
{
data[min_idx] = data[i];
data[i] = min_val;
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Very basic quicksort algorithm, recursively launching the next level.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
// If we're too deep or there are few elements left, we fall back to selection sort...
if (depth >= MAX_DEPTH || right-left <= INSERTION_SORT)
{
selection_sort(data, left, right);
return;
}
unsigned int *lptr = data+left;
unsigned int *rptr = data+right;
unsigned int pivot = data[(left+right)/2];
// Do the partitioning.
while (lptr <= rptr)
{
// Find the next left- and right-hand values to swap
unsigned int lval = *lptr;
unsigned int rval = *rptr;
// Move the left pointer as long as the pointed element is smaller than the pivot.
while (lval < pivot)
{
lptr++;
lval = *lptr;
}
// Move the right pointer as long as the pointed element is larger than the pivot.
while (rval > pivot)
{
rptr--;
rval = *rptr;
}
// If the swap points are valid, do the swap!
if (lptr <= rptr)
{
*lptr++ = rval;
*rptr-- = lval;
}
}
// Now the recursive part
int nright = rptr - data;
int nleft = lptr - data;
// Launch a new block to sort the left part.
if (left < (rptr-data))
{
cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
cdp_simple_quicksort<<< 1, 1, 0, s >>>(data, left, nright, depth+1);
cudaStreamDestroy(s);
}
// Launch a new block to sort the right part.
if ((lptr-data) < right)
{
cudaStream_t s1;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cdp_simple_quicksort<<< 1, 1, 0, s1 >>>(data, nleft, right, depth+1);
cudaStreamDestroy(s1);
}
}
////////////////////////////////////////////////////////////////////////////////
// Call the quicksort kernel from the host.
////////////////////////////////////////////////////////////////////////////////
void run_qsort(unsigned int *data, unsigned int nitems)
{
// Prepare CDP for the max depth 'MAX_DEPTH'.
checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));
// Launch on device
int left = 0;
int right = nitems-1;
std::cout << "Launching kernel on the GPU" << std::endl;
cdp_simple_quicksort<<< 1, 1 >>>(data, left, right, 0);
checkCudaErrors(cudaDeviceSynchronize());
}
////////////////////////////////////////////////////////////////////////////////