Created
April 20, 2018 15:36
-
-
Save chintak/239602d79bf425e1a790e35ca3a787fd to your computer and use it in GitHub Desktop.
CUDA ``bincount`` and ``reduce`` op implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef _HELPER_H_ | |
#define _HELPER_H_ | |
#include <cuda_runtime.h> | |
#include <sys/types.h> | |
#include <cmath> | |
#include <functional> | |
#include <iostream> | |
#include <random> | |
#include <vector> | |
#define MIN(a, b) (((a) < (b)) ? (a) : (b)) | |
#define MAX(a, b) (((a) > (b)) ? (a) : (b)) | |
#define UMUL(a, b, c) (((a) * (b)) + (c)) | |
#define PRINT_ARRAY(name, v, sz) \ | |
{ \ | |
auto s = ((sz < 20) ? sz : 20); \ | |
std::cout << name; \ | |
for (int i = 0; i < s; i++) \ | |
std::cout << v[i] << ", "; \ | |
std::cout << ((s == sz) ? "" : "...") << std::endl; \ | |
} | |
#define PRINT_VALUE(name, v) std::cout << name << v << std::endl; | |
inline void | |
check(cudaError_t code, const char* file, int line, bool abort = true) { | |
if (code != cudaSuccess) { | |
fprintf( | |
stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line); | |
if (abort) | |
exit(code); | |
} | |
} | |
// This will output the proper CUDA error strings in the event that a CUDA host | |
// call returns an error | |
#define checkCudaErrors(val) check((val), __FILE__, __LINE__) | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "helper.h" | |
#include "histogram.cuh" | |
#include "reduce.cuh" | |
__global__ void bincount_kernel( | |
int64_t size, | |
int64_t nbins, | |
size_t pitch, | |
const int64_t* d_idata, | |
int64_t* d_counts) { | |
// | |
extern __shared__ unsigned char smem[]; | |
int* sHist = reinterpret_cast<int*>(smem); | |
auto tid = threadIdx.x; | |
auto idx = UMUL(blockDim.x, blockIdx.x, threadIdx.x); | |
// reset shared mem | |
for (size_t i = tid; i < nbins; i += blockDim.x) { | |
sHist[i] = 0; | |
} | |
__syncthreads(); | |
// inc counter | |
if (idx < size) { | |
int64_t val = d_idata[idx]; | |
atomicAdd(&sHist[val], 1); | |
} | |
__syncthreads(); | |
// move the bin values into global device memory d_counts | |
for (size_t i = tid; i < nbins; i += blockDim.x) { | |
int64_t* d_row = (int64_t*)((char*)d_counts + i * pitch); | |
d_row[blockIdx.x] = static_cast<int64_t>(sHist[i]); | |
} | |
} | |
void bincount_gpu( | |
int64_t size, | |
int64_t nThreads, | |
int64_t nBlocks, | |
int64_t nbins, | |
const int64_t* d_idata, | |
int64_t* d_counts, | |
int64_t min, | |
int64_t max) { | |
auto sumOp = [] __device__(int64_t & a, int64_t b) { a += b; }; | |
int64_t* d_temp2d; | |
size_t pitch; | |
// alloc 2d device memory | |
checkCudaErrors( | |
cudaMallocPitch(&d_temp2d, &pitch, nBlocks * sizeof(int64_t), nbins)); | |
// launch the bincount kernel | |
int smemSize = nbins * sizeof(int64_t); | |
bincount_kernel<<<nBlocks, nThreads, smemSize, 0>>>( | |
size, nbins, pitch, d_idata, d_temp2d); | |
checkCudaErrors(cudaGetLastError()); | |
// reduce each bin | |
int64_t dummy; | |
int nth = (nBlocks < nThreads) ? nBlocks : nThreads; | |
int nblk = (nBlocks + nth - 1) / nth; | |
int64_t* d_row; | |
for (int i = 0; i < nbins; i++) { | |
d_row = (int64_t*)((char*)d_temp2d + i * pitch); | |
reduce_gpu(nBlocks, nth, nblk, d_row, &d_counts[i], &dummy, sumOp, false); | |
} | |
// free 2d temp | |
cudaFree(d_temp2d); | |
} | |
void bincount_init( | |
int64_t size, | |
int64_t nThreads, | |
int64_t nBlocks, | |
const int64_t* d_idata, | |
int64_t* h_counts, | |
int64_t min, | |
int64_t max) { | |
auto nbins = max + 1; | |
auto histBytes = nbins * sizeof(int64_t); | |
int64_t* d_counts; | |
// alloc hist on device | |
checkCudaErrors(cudaMalloc(&d_counts, histBytes)); | |
// kernel call | |
bincount_gpu(size, nThreads, nBlocks, nbins, d_idata, d_counts, min, max); | |
// copy result device to host | |
checkCudaErrors( | |
cudaMemcpy(h_counts, d_counts, histBytes, cudaMemcpyDeviceToHost)); | |
PRINT_ARRAY("bincount GPU: ", h_counts, nbins); | |
cudaFree(d_counts); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef _HISTOGRAM_CUH_ | |
#define _HISTOGRAM_CUH_ | |
#include <sys/types.h> | |
__global__ void bincount_kernel( | |
int64_t size, | |
int64_t nbins, | |
size_t pitch, | |
const int64_t* d_idata, | |
int64_t* d_counts); | |
void bincount_gpu( | |
int64_t size, | |
int64_t nThreads, | |
int64_t nBlocks, | |
int64_t nbins, | |
const int64_t* d_idata, | |
int64_t* d_counts, | |
int64_t min, | |
int64_t max); | |
void bincount_init( | |
int64_t size, | |
int64_t nThreads, | |
int64_t nBlocks, | |
const int64_t* d_idata, | |
int64_t* h_counts, | |
int64_t min, | |
int64_t max); | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NVCC=/usr/local/cuda/bin/nvcc | |
CCFLAGS=-std=c++11 --expt-extended-lambda -G -Xcompiler -rdynamic -m64 $(CFLAGS) | |
ARCH=-gencode arch=compute_35,code=sm_35 | |
LIBS=-lcudart | |
TARGET=run | |
SOURCES = TestBincount.cu histogram.cu | |
OBJECTS = $(SOURCES:%.cu=%.o) | |
all: $(TARGET) | |
%.o: %.cu | |
$(NVCC) $(CCFLAGS) $(ARCH) -c $< | |
$(TARGET): $(OBJECTS) | |
$(NVCC) $(CCFLAGS) $(ARCH) -o $(TARGET) $^ $(LIBS) | |
clean: $(OBJECTS) | |
rm -f $^ $(TARGET) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef _REDUCE_CUH_ | |
#define _REDUCE_CUH_ | |
#include <sys/types.h> | |
#define LOWER_POW_2 \ | |
[](int a) { \ | |
int pow2; \ | |
pow2 = 1 << static_cast<int>(floor(log2((float)a))); \ | |
if (pow2 == a) \ | |
pow2 >>= 1; \ | |
return pow2; \ | |
} | |
template <typename Op> | |
__global__ void reduce_kernel( | |
int size, | |
const int64_t* d_idata, | |
int64_t* d_odata, | |
int pow2, | |
Op op) { | |
extern __shared__ unsigned char smem[]; | |
int64_t* sm = reinterpret_cast<int64_t*>(smem); | |
auto tid = threadIdx.x; | |
auto idx = UMUL(blockDim.x, blockIdx.x, threadIdx.x); | |
// copy data from global memory into shared memory | |
sm[tid] = (idx < size) ? d_idata[idx] : -1; // use -1 to detect out of bounds | |
__syncthreads(); | |
// perform reduction on shared memory in the block | |
for (int s = pow2; s > 0; s >>= 1) { | |
if (tid < s && (idx + s) < size) { | |
op(sm[tid], sm[tid + s]); | |
} | |
__syncthreads(); | |
} | |
if (tid == 0) | |
d_odata[blockIdx.x] = sm[0]; | |
} | |
template <typename Op> | |
void reduce_gpu( | |
int size, | |
int nThreads, | |
int nBlocks, | |
const int64_t* d_idata, | |
int64_t* d_odata, | |
int64_t* val, | |
Op op, | |
bool copyDtoH = true) { | |
int smemSize = (nThreads <= 32) ? 2 * nThreads * sizeof(int64_t) | |
: nThreads * sizeof(int64_t); | |
int pow2 = LOWER_POW_2(nThreads); | |
reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>( | |
size, d_idata, d_odata, pow2, op); | |
checkCudaErrors(cudaGetLastError()); | |
while (nBlocks > 1) { | |
size = nBlocks; | |
nBlocks = (nBlocks + nThreads - 1) / nThreads; | |
reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>( | |
size, d_odata, d_odata, pow2, op); | |
checkCudaErrors(cudaGetLastError()); | |
} | |
if (copyDtoH) | |
checkCudaErrors( | |
cudaMemcpy(val, d_odata, sizeof(int64_t), cudaMemcpyDeviceToHost)); | |
} | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// vanilla CUDA bincount implementation | |
#include "helper.h" | |
#include "histogram.cuh" | |
#include "reduce.cuh" | |
#ifndef INPUT_DATA_SIZE | |
#define INPUT_DATA_SIZE 10 | |
#endif | |
#ifndef NTHREADS | |
#define NTHREADS 256 | |
#endif | |
#define UNIFORM_DISTRIBUTION_HIGH 17 | |
#define UNIFORM_DISTRIBUTION_LOW 4 | |
#define ASSERT_EQUAL(a, b, n) \ | |
{ \ | |
bool f = true; \ | |
for (int i = 0; i < (n); i++) { \ | |
if ((a)[i] != (b)[i]) { \ | |
f = false; \ | |
break; \ | |
} \ | |
} \ | |
std::cout << "GPU" << ((f) ? " == " : " != ") << "CPU\n"; \ | |
} | |
// forward declarations | |
void randominit(int64_t* h_data); | |
template <typename Op> | |
void reduce_cpu(const int64_t* h_idata, Op op); | |
void bincount_cpu( | |
int64_t size, | |
const int64_t* h_data, | |
const int64_t* gpu_hist, | |
int64_t min, | |
int64_t max); | |
int main(int argc, char** argv) { | |
int64_t h_idata[INPUT_DATA_SIZE]{0}; | |
auto minOp = [] __host__ __device__(int64_t & a, int64_t b) { | |
a = (a < b) ? a : b; | |
}; | |
auto maxOp = [] __host__ __device__(int64_t & a, int64_t b) { | |
a = (a > b) ? a : b; | |
}; | |
//////////////////////////////////////////////////////////////////////// | |
// random init | |
randominit(h_idata); | |
// print input data | |
PRINT_ARRAY("Input: ", h_idata, INPUT_DATA_SIZE); | |
//////////////////////////////////////////////////////////////////////// | |
// alloc host input data and copy from host to device | |
size_t size = INPUT_DATA_SIZE * sizeof(int64_t); | |
int64_t *d_idata, *d_odata; | |
checkCudaErrors(cudaMalloc(&d_idata, size)); | |
checkCudaErrors(cudaMemcpy(d_idata, h_idata, size, cudaMemcpyHostToDevice)); | |
//////////////////////////////////////////////////////////////////////// | |
// call kernel and main gpu computation logic | |
int64_t minVal, maxVal; | |
int nThreads = NTHREADS; | |
int nBlocks = (INPUT_DATA_SIZE + nThreads - 1) / nThreads; | |
std::cout << "#threads: " << nThreads << " #blocks: " << nBlocks << std::endl; | |
// alloc device output data | |
checkCudaErrors(cudaMalloc(&d_odata, nBlocks * sizeof(int64_t))); | |
// find the min of the array | |
reduce_gpu( | |
INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, d_odata, &minVal, minOp); | |
reduce_cpu(h_idata, minOp); | |
PRINT_VALUE("Min GPU: ", minVal); | |
// find the max of the array - nbins | |
reduce_gpu( | |
INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, d_odata, &maxVal, maxOp); | |
reduce_cpu(h_idata, maxOp); | |
PRINT_VALUE("Max GPU: ", maxVal); | |
// do bincount calc | |
auto h_counts = (int64_t*)malloc((maxVal + 1) * sizeof(int64_t)); | |
bincount_init( | |
INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, h_counts, minVal, maxVal); | |
bincount_cpu(INPUT_DATA_SIZE, h_idata, h_counts, minVal, maxVal); | |
free(h_counts); | |
//////////////////////////////////////////////////////////////////////// | |
// free device memory | |
cudaFree(d_idata); | |
cudaFree(d_odata); | |
cudaDeviceReset(); | |
cudaDeviceSynchronize(); | |
return 0; | |
} | |
void randominit(int64_t* h_data) { | |
std::random_device rd; | |
std::mt19937 gen(rd()); | |
std::uniform_int_distribution<> distr( | |
UNIFORM_DISTRIBUTION_LOW, UNIFORM_DISTRIBUTION_HIGH); | |
for (int i = 0; i < INPUT_DATA_SIZE; i++) { | |
h_data[i] = static_cast<int64_t>(distr(gen)); | |
} | |
} | |
template <typename Op> | |
void reduce_cpu(const int64_t* h_idata, Op op) { | |
int64_t output = h_idata[0]; | |
for (int i = 1; i < INPUT_DATA_SIZE; i++) { | |
op(output, h_idata[i]); | |
} | |
PRINT_VALUE("XXX CPU: ", output); | |
} | |
void bincount_cpu( | |
int64_t size, | |
const int64_t* h_data, | |
const int64_t* gpu_hist, | |
int64_t min, | |
int64_t max) { | |
// cpu bincount implementation for verification | |
auto sz = max + 1; | |
int64_t* hist = (int64_t*)malloc(sz * sizeof(int64_t)); | |
for (int i = 0; i < sz; i++) { | |
hist[i] = 0; | |
} | |
for (int i = 0; i < size; i++) { | |
hist[h_data[i]] += 1; | |
} | |
PRINT_ARRAY("bincount CPU: ", hist, sz); | |
ASSERT_EQUAL(gpu_hist, hist, sz); | |
free(hist); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment