@ShigekiKarita
Created January 31, 2019 03:13
/*
Cooperative Groups requires CUDA 9 or later.
See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cooperative-groups
*/
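/*
Example build lines (assumptions, not from the original gist: the file name
reduce.cu is hypothetical, -arch should match your GPU, and the tile widths
are illustrative):
  nvcc -std=c++11 -arch=sm_60 -DUSE_SHFL=32 reduce.cu -o reduce_shfl
  nvcc -std=c++11 -arch=sm_60 -DUSE_TILE=32 reduce.cu -o reduce_tile
  nvcc -std=c++11 -arch=sm_60 -DUSE_SYNCTHREADS reduce.cu -o reduce_sync
  nvcc -std=c++11 -arch=sm_60 reduce.cu -o reduce_cg
*/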
#include <assert.h>
#include <numeric>
#include <cmath>
#include <iostream>
#include <cooperative_groups.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include "timer.hpp"
#include "typed_argparser.hpp"
#define PARALLEL_FOR(i, n) \
for (auto i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
// Use 1024 threads per block, which requires compute capability 2.x or above
// #if __CUDA_ARCH__ >= 200
static const int CUDA_NUM_THREADS = 1024;
// #else
// static const int CUDA_NUM_THREADS = 512;
// #endif
namespace cg = cooperative_groups;
/** thread_group datatype provides
void sync(); // Synchronize the threads in the group
unsigned size(); // Total number of threads in the group
unsigned thread_rank(); // Rank of the calling thread within [0, size)
bool is_valid(); // Whether the group violated any API constraints
*/
/** thread_block datatype provides
dim3 group_index(); // 3-dimensional block index within the grid
dim3 thread_index(); // 3-dimensional thread index within the block
*/
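/*
Minimal usage sketch of the API above (illustrative only, not part of the
original gist; "k" is a hypothetical kernel name):
__global__ void k() {
    auto block = cg::this_thread_block();        // the whole thread block
    auto tile = cg::tiled_partition<32>(block);  // one warp-sized tile
    tile.sync();                                 // synchronize just this tile
}
*/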
template <typename T, typename Op, typename Group>
__device__ T reduce_threads(Group g, T *temp, T val, Op op) {
    auto lane = g.thread_rank();
    // Tree reduction: each iteration halves the number of active threads.
    // Each active thread folds temp[lane + i] into its own value.
    #pragma unroll
    for (auto i = g.size() / 2; i > 0; i /= 2) {
        temp[lane] = val;
        g.sync(); // wait for all threads to store
        if (lane < i) val = op(val, temp[lane + i]);
        g.sync(); // wait for all threads to load
    }
    return val; // note: only thread 0 will return the full result
}
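/*
Worked example of the loop above (a sketch assuming g.size() == 8 and an
additive op): with values v0..v7 stored in temp,
  i = 4: lanes 0-3 fold in temp[4..7]
  i = 2: lanes 0-1 fold in temp[2..3]
  i = 1: lane 0 folds in temp[1]
so after the loop, lane 0 holds op applied over all eight values.
*/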
template <typename T, typename Op = thrust::plus<T> >
__global__ void sum_kernel_block(T *sum, const T *input, int n, T init = 0, Op op = {})
{
    // grid-stride accumulation: each thread reduces its share of the input
    T thread_acc = init;
    PARALLEL_FOR(i, n) {
        thread_acc = op(thread_acc, input[i]);
    }
#ifdef USE_SHFL
    // warp-level reduction via register shuffles; no shared memory needed
    auto tile = cg::tiled_partition<USE_SHFL>(cg::this_thread_block());
    for (auto i = tile.size() / 2; i > 0; i /= 2) {
        thread_acc = op(thread_acc, tile.shfl_down(thread_acc, i));
    }
    if (tile.thread_rank() == 0) atomicAdd(sum, thread_acc);
#elif defined USE_TILE
    // statically sized tiles, each reducing over its own shared-memory slice
    extern __shared__ T temp[];
    auto g = cg::this_thread_block();
    static_assert(USE_TILE > 0, "USE_TILE should be > 0");
    auto tile_size = USE_TILE;
    auto tile_idx = g.thread_rank() / tile_size;
    auto t = &temp[tile_size * tile_idx];
    auto tile = cg::tiled_partition<USE_TILE>(g);
    auto tile_sum = reduce_threads(tile, t, thread_acc, op);
    if (tile.thread_rank() == 0) atomicAdd(sum, tile_sum);
#elif defined USE_SYNCTHREADS
    // classic block-wide tree reduction with raw __syncthreads()
    extern __shared__ T temp[];
    auto tid = threadIdx.x;
    for (auto i = blockDim.x / 2; i > 0; i /= 2) {
        temp[tid] = thread_acc;
        __syncthreads();
        if (tid < i) thread_acc = op(thread_acc, temp[tid + i]);
        __syncthreads();
    }
    if (tid == 0) atomicAdd(sum, thread_acc);
#else
    // block-wide reduction via cooperative groups
    extern __shared__ T temp[];
    auto g = cg::this_thread_block();
    auto block_sum = reduce_threads(g, temp, thread_acc, op);
    if (g.thread_rank() == 0) atomicAdd(sum, block_sum);
#endif
}
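// host-side tolerance check in the style of numpy.isclose:
// |x - y| <= atol + rtol * |y|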
bool is_close(float x, float y, float rtol = 1e-5, float atol = 1e-8) {
    return std::abs(x - y) <= atol + rtol * std::abs(y);
}
int main(int argc, char* argv[]) {
    typed_argparser::ArgParser parser(argc, argv);
    int size = 1 << 16;
    parser.add("--size", size, "number of elements to be summed");
    if (parser.help_wanted) {
        std::cout << parser.help_message() << std::endl;
        return 1;
    }
    // prepare data on the host (random generation is left commented out;
    // a constant fill makes the expected sum easy to check)
    thrust::host_vector<float> h_vec(size);
    thrust::default_random_engine rng(123);
    thrust::uniform_real_distribution<float> dist(0.0, 1.0);
    // thrust::generate(h_vec.begin(), h_vec.end(),
    //                  [&](){ return dist(rng); });
    thrust::fill(h_vec.begin(), h_vec.end(), 1.0f);
std::cout << "[std::accumulate] " << std::accumulate(h_vec.begin(), h_vec.end(), 0.0f, thrust::minus<float>()) << std::endl;
    // transfer to device and compute sum
    thrust::device_vector<float> d_vec = h_vec;
    float ref = 0;
    {
        timer t;
        ref = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, thrust::plus<float>());
        std::cout << "[thrust::reduce] sum is " << ref << " (" << t.elapsed() << " sec)" << std::endl;
    }
    {
        timer t;
        thrust::device_vector<float> result(1);
        dim3 block(CUDA_NUM_THREADS, 1, 1);
        dim3 grid((d_vec.size() + block.x - 1) / block.x, 1, 1);
        // launch configuration is <<<grid, block, shared_mem_bytes>>>;
        // the shuffle variant needs no shared memory
        sum_kernel_block<<<grid, block
#ifndef USE_SHFL
            , block.x * sizeof(float)
#endif
            >>>(
            thrust::raw_pointer_cast(result.data()),
            thrust::raw_pointer_cast(d_vec.data()),
            d_vec.size(),
            0.0f,
            thrust::plus<float>());
        std::cout << "[sum_kernel_block] sum is " << result[0]
                  << " (" << t.elapsed() << " sec)" << std::endl;
        assert(is_close(ref, result[0]));
    }
}