@ShigekiKarita
Created January 31, 2019 03:13
/*
Cooperative Groups requires CUDA 9 or later.
See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cooperative-groups
*/
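/*
Example build lines (assumptions, not from the original gist: the file name
reduce.cu is hypothetical, -arch should match your GPU, and the tile widths
are illustrative):
  nvcc -std=c++11 -arch=sm_60 -DUSE_SHFL=32 reduce.cu -o reduce_shfl
  nvcc -std=c++11 -arch=sm_60 -DUSE_TILE=32 reduce.cu -o reduce_tile
  nvcc -std=c++11 -arch=sm_60 -DUSE_SYNCTHREADS reduce.cu -o reduce_sync
  nvcc -std=c++11 -arch=sm_60 reduce.cu -o reduce_cg
*/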
#include <assert.h>
#include <numeric>
#include <cmath>
#include <iostream>
#include <cooperative_groups.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include "timer.hpp"
#include "typed_argparser.hpp"
#define PARALLEL_FOR(i, n) \
for (auto i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
// Use 1024 threads per block, which requires compute capability 2.x or above
// #if __CUDA_ARCH__ >= 200
static const int CUDA_NUM_THREADS = 1024;
// #else
// static const int CUDA_NUM_THREADS = 512;
// #endif
namespace cg = cooperative_groups;
/** thread_group datatype provides
void sync(); // Synchronize the threads in the group
unsigned size(); // Total number of threads in the group
unsigned thread_rank(); // Rank of the calling thread within [0, size)
bool is_valid(); // Whether the group violated any API constraints
*/
/** thread_block datatype provides
dim3 group_index(); // 3-dimensional block index within the grid
dim3 thread_index(); // 3-dimensional thread index within the block
*/
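/*
Minimal usage sketch of the API above (illustrative only, not part of the
original gist; "k" is a hypothetical kernel name):
__global__ void k() {
    auto block = cg::this_thread_block();        // the whole thread block
    auto tile = cg::tiled_partition<32>(block);  // one warp-sized tile
    tile.sync();                                 // synchronize just this tile
}
*/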
template <typename T, typename Op, typename Group>
__device__ T reduce_threads(Group g, T *temp, T val, Op op) {
    auto lane = g.thread_rank();
    // Tree reduction: each iteration halves the number of active threads.
    // Each active thread folds temp[lane + i] into its own value.
    #pragma unroll
    for (auto i = g.size() / 2; i > 0; i /= 2) {
        temp[lane] = val;
        g.sync(); // wait for all threads to store
        if (lane < i) val = op(val, temp[lane + i]);
        g.sync(); // wait for all threads to load
    }
    return val; // note: only thread 0 will return the full result
}
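/*
Worked example of the loop above (a sketch assuming g.size() == 8 and an
additive op): with values v0..v7 stored in temp,
  i = 4: lanes 0-3 fold in temp[4..7]
  i = 2: lanes 0-1 fold in temp[2..3]
  i = 1: lane 0 folds in temp[1]
so after the loop, lane 0 holds op applied over all eight values.
*/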
template <typename T, typename Op = thrust::plus<T> >
__global__ void sum_kernel_block(T *sum, const T *input, int n, T init = 0, Op op = {})
{
    // grid-stride accumulation: each thread reduces its share of the input
    T thread_acc = init;
    PARALLEL_FOR(i, n) {
        thread_acc = op(thread_acc, input[i]);
    }
#ifdef USE_SHFL
    // warp-level reduction via register shuffles; no shared memory needed
    auto tile = cg::tiled_partition<USE_SHFL>(cg::this_thread_block());
    for (auto i = tile.size() / 2; i > 0; i /= 2) {
        thread_acc = op(thread_acc, tile.shfl_down(thread_acc, i));
    }
    if (tile.thread_rank() == 0) atomicAdd(sum, thread_acc);
#elif defined USE_TILE
    // statically sized tiles, each reducing over its own shared-memory slice
    extern __shared__ T temp[];
    auto g = cg::this_thread_block();
    static_assert(USE_TILE > 0, "USE_TILE should be > 0");
    auto tile_size = USE_TILE;
    auto tile_idx = g.thread_rank() / tile_size;
    auto t = &temp[tile_size * tile_idx];
    auto tile = cg::tiled_partition<USE_TILE>(g);
    auto tile_sum = reduce_threads(tile, t, thread_acc, op);
    if (tile.thread_rank() == 0) atomicAdd(sum, tile_sum);
#elif defined USE_SYNCTHREADS
    // classic block-wide tree reduction with raw __syncthreads()
    extern __shared__ T temp[];
    auto tid = threadIdx.x;
    for (auto i = blockDim.x / 2; i > 0; i /= 2) {
        temp[tid] = thread_acc;
        __syncthreads();
        if (tid < i) thread_acc = op(thread_acc, temp[tid + i]);
        __syncthreads();
    }
    if (tid == 0) atomicAdd(sum, thread_acc);
#else
    // block-wide reduction via cooperative groups
    extern __shared__ T temp[];
    auto g = cg::this_thread_block();
    auto block_sum = reduce_threads(g, temp, thread_acc, op);
    if (g.thread_rank() == 0) atomicAdd(sum, block_sum);
#endif
}
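// host-side tolerance check in the style of numpy.isclose:
// |x - y| <= atol + rtol * |y|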
bool is_close(float x, float y, float rtol = 1e-5, float atol = 1e-8) {
    return std::abs(x - y) <= atol + rtol * std::abs(y);
}
int main(int argc, char* argv[]) {
    typed_argparser::ArgParser parser(argc, argv);
    int size = 1 << 16;
    parser.add("--size", size, "number of elements to be summed");
    if (parser.help_wanted) {
        std::cout << parser.help_message() << std::endl;
        return 1;
    }
    // prepare data on the host (random generation is left commented out;
    // a constant fill makes the expected sum easy to check)
    thrust::host_vector<float> h_vec(size);
    thrust::default_random_engine rng(123);
    thrust::uniform_real_distribution<float> dist(0.0, 1.0);
    // thrust::generate(h_vec.begin(), h_vec.end(),
    //                  [&](){ return dist(rng); });
    thrust::fill(h_vec.begin(), h_vec.end(), 1.0f);
std::cout << "[std::accumulate] " << std::accumulate(h_vec.begin(), h_vec.end(), 0.0f, thrust::minus<float>()) << std::endl;
    // transfer to device and compute sum
    thrust::device_vector<float> d_vec = h_vec;
    float ref = 0;
    {
        timer t;
        ref = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, thrust::plus<float>());
        std::cout << "[thrust::reduce] sum is " << ref << " (" << t.elapsed() << " sec)" << std::endl;
    }
    {
        timer t;
        thrust::device_vector<float> result(1);
        dim3 block(CUDA_NUM_THREADS, 1, 1);
        dim3 grid((d_vec.size() + block.x - 1) / block.x, 1, 1);
        // launch configuration is <<<grid, block, shared_mem_bytes>>>;
        // the shuffle variant needs no shared memory
        sum_kernel_block<<<grid, block
#ifndef USE_SHFL
            , block.x * sizeof(float)
#endif
            >>>(
            thrust::raw_pointer_cast(result.data()),
            thrust::raw_pointer_cast(d_vec.data()),
            d_vec.size(),
            0.0f,
            thrust::plus<float>());
        std::cout << "[sum_kernel_block] sum is " << result[0]
                  << " (" << t.elapsed() << " sec)" << std::endl;
        assert(is_close(ref, result[0]));
    }
}