Created
January 31, 2019 03:13
-
-
Save ShigekiKarita/025b7040873f6b32e14e5963520aef16 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Cooperative Groups requires CUDA9 or later. | |
see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cooperative-groups | |
*/ | |
#include <assert.h> | |
#include <numeric> | |
#include <cmath> | |
#include <iostream> | |
#include <cooperative_groups.h> | |
#include <thrust/host_vector.h> | |
#include <thrust/device_vector.h> | |
#include <thrust/generate.h> | |
#include <thrust/reduce.h> | |
#include <thrust/functional.h> | |
#include <thrust/random.h> | |
#include "timer.hpp" | |
#include "typed_argparser.hpp" | |
#define PARALLEL_FOR(i, n) \ | |
for (auto i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) | |
// Use 1024 threads per block, which requires cuda sm_2x or above | |
// #if __CUDA_ARCH__ >= 200 | |
static const int CUDA_NUM_THREADS = 1024; | |
// #else | |
// static const int CUDA_NUM_THREADS = 512; | |
// #endif | |
namespace cg = cooperative_groups; | |
/** thread_group datatype provides | |
void sync(); // Synchronize the threads in the group | |
unsigned size(); // Total number of threads in the group | |
unsigned thread_rank(); // Rank of the calling thread within [0, size] | |
bool is_valid(); // Whether the group violated any APIconstraints | |
*/ | |
/** thread_block datatype provides | |
dim3 group_index(); // 3-dimensional block index within the grid | |
dim3 thread_index(); // 3-dimensional thread index within the block | |
*/ | |
template <typename T, typename Op, typename Group> | |
__device__ T reduce_threads(Group g, T *temp, T val, Op op) { | |
auto lane = g.thread_rank(); | |
// Each iteration halves the number of active threads | |
// Each thread adds its partial sum[i] to sum[lane+i] | |
#pragma unroll | |
for (auto i = g.size() / 2; i > 0; i /= 2) { | |
temp[lane] = val; | |
g.sync(); // wait for all threads to store | |
if(lane<i) val += temp[lane + i]; | |
g.sync(); // wait for all threads to load | |
} | |
return val; // note: only thread 0 will return full sum | |
} | |
template <typename T, typename Op = thrust::plus<T> > | |
__global__ void sum_kernel_block(T *sum, const T *input, int n, T init=0, Op op={}) | |
{ | |
T thread_acc = init; | |
PARALLEL_FOR(i, n) { | |
thread_acc = op(thread_acc, input[i]); | |
} | |
#ifdef USE_SHFL | |
auto tile = cg::tiled_partition<USE_SHFL>(cg::this_thread_block()); | |
for (auto i = tile.size() / 2; i > 0; i /= 2) { | |
thread_acc = op(thread_acc, tile.shfl_down(thread_acc, i)); | |
} | |
if (tile.thread_rank() == 0) atomicAdd(sum, thread_acc); | |
#elif defined USE_TILE | |
extern __shared__ T temp[]; | |
auto g = cg::this_thread_block(); | |
static_assert(USE_TILE > 0, "USE_TILE should be > 0"); | |
auto tile_size = USE_TILE; | |
auto tile_idx = g.thread_rank() / tile_size; | |
auto t = &temp[tile_size * tile_idx]; | |
auto tile = cg::tiled_partition<USE_TILE>(g); | |
auto tile_sum = reduce_threads(tile, t, thread_acc); | |
if (tile.thread_rank() == 0) atomicAdd(sum, tile_sum); | |
#elif defined USE_SYNCTHREADS | |
extern __shared__ T temp[]; | |
auto tid = threadIdx.x; | |
// auto g = cg::this_thread_block(); | |
// printf("tid %d(%d) of %d(%d)\n", | |
// tid, g.thread_rank(), | |
// blockDim.x, g.size()); | |
for (auto i = blockDim.x / 2; i > 0; i /= 2) { | |
temp[tid] = thread_acc; | |
__syncthreads(); | |
if (tid < i) thread_acc = op(thread_acc, temp[tid + i]); | |
__syncthreads(); | |
} | |
if (tid == 0) atomicAdd(sum, thread_acc); | |
#else | |
extern __shared__ T temp[]; | |
auto g = cg::this_thread_block(); | |
auto block_sum = reduce_sum(g, temp, thread_acc); | |
if (g.thread_rank() == 0) atomicAdd(sum, block_sum); | |
#endif | |
} | |
bool is_close(float x, float y, float rtol=1e-5, float atol=1e-8) { | |
return std::abs(x - y) <= atol + rtol * std::abs(y); | |
} | |
int main(int argc, char* argv[]) { | |
typed_argparser::ArgParser parser(argc, argv); | |
int size = 1 << 16; | |
parser.add("--size", size, "number of elements to be summed"); | |
if (parser.help_wanted) { | |
std::cout << parser.help_message() << std::endl; | |
return 1; | |
} | |
// generate random data on the host | |
thrust::host_vector<float> h_vec(size); | |
thrust::default_random_engine rng(123); | |
thrust::uniform_real_distribution<float> dist(0.0, 1.0); | |
// thrust::generate(h_vec.begin(), h_vec.end(), | |
// [&](){ return dist(rng); } ); | |
thrust::fill(h_vec.begin(), h_vec.end(), 1.0f); | |
std::cout << "[std::accumulate] " << std::accumulate(h_vec.begin(), h_vec.end(), 0.0f, thrust::minus<float>()) << std::endl; | |
// transfer to device and compute sum | |
thrust::device_vector<float> d_vec = h_vec; | |
float ref = 0; | |
{ | |
timer t; | |
ref = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, thrust::minus<float>()); | |
std::cout << "[thrust::reduce] sum is " << ref << " (" << t.elapsed() << " sec)" << std::endl; | |
} | |
{ | |
timer t; | |
thrust::device_vector<float> result(1); | |
dim3 block(CUDA_NUM_THREADS, 1, 1); | |
dim3 grid((d_vec.size() + block.x - 1) / block.x, 1, 1); | |
sum_kernel_block<<<block, grid | |
#ifndef USE_SHFL | |
, block.x * sizeof(float) | |
#endif | |
>>>( | |
thrust::raw_pointer_cast(result.data()), | |
thrust::raw_pointer_cast(d_vec.data()), | |
d_vec.size(), | |
0.0f, | |
thrust::minus<float>()); | |
std::cout << "[sum_kernel_block] sum is " << result[0] | |
<< " (" << t.elapsed() << " sec)" << std::endl; | |
assert(is_close(ref, result[0])); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment