@chintak · Created April 20, 2018 15:36
CUDA `bincount` and `reduce` op implementation
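The gist is split across six files: helper.h (error-checking and printing macros), histogram.cu and histogram.cuh (the bincount kernel and its host-side drivers), reduce.cuh (a templated block-wise reduction used for min, max, and for summing the per-block histogram counts), a Makefile, and TestBincount.cu (a test driver that compares the GPU results against a CPU reference).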
// helper.h
#ifndef _HELPER_H_
#define _HELPER_H_
#include <cuda_runtime.h>
#include <sys/types.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <random>
#include <vector>

#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
// multiply-add: a * b + c
#define UMUL(a, b, c) (((a) * (b)) + (c))

// print at most the first 20 elements of an array
#define PRINT_ARRAY(name, v, sz)                          \
  {                                                       \
    auto s = ((sz < 20) ? sz : 20);                       \
    std::cout << name;                                    \
    for (int i = 0; i < s; i++)                           \
      std::cout << v[i] << ", ";                          \
    std::cout << ((s == sz) ? "" : "...") << std::endl;   \
  }
#define PRINT_VALUE(name, v) std::cout << name << v << std::endl;

inline void
check(cudaError_t code, const char* file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(
        stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
      exit(code);
  }
}
// This will output the proper CUDA error strings in the event that a CUDA host
// call returns an error
#define checkCudaErrors(val) check((val), __FILE__, __LINE__)
#endif // _HELPER_H_
#include "helper.h"
#include "histogram.cuh"
#include "reduce.cuh"
// Each block builds a private histogram in shared memory and then writes its
// partial counts into row `bin`, column `blockIdx.x` of the pitched 2D array
// d_counts; the per-bin totals are summed across blocks afterwards.
__global__ void bincount_kernel(
    int64_t size,
    int64_t nbins,
    size_t pitch,
    const int64_t* d_idata,
    int64_t* d_counts) {
  extern __shared__ unsigned char smem[];
  int* sHist = reinterpret_cast<int*>(smem);
  auto tid = threadIdx.x;
  auto idx = UMUL(blockDim.x, blockIdx.x, threadIdx.x);
  // reset shared mem
  for (size_t i = tid; i < nbins; i += blockDim.x) {
    sHist[i] = 0;
  }
  __syncthreads();
  // inc counter
  if (idx < size) {
    int64_t val = d_idata[idx];
    atomicAdd(&sHist[val], 1);
  }
  __syncthreads();
  // move the bin values into global device memory d_counts
  for (size_t i = tid; i < nbins; i += blockDim.x) {
    int64_t* d_row = (int64_t*)((char*)d_counts + i * pitch);
    d_row[blockIdx.x] = static_cast<int64_t>(sHist[i]);
  }
}
void bincount_gpu(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    int64_t nbins,
    const int64_t* d_idata,
    int64_t* d_counts,
    int64_t min,
    int64_t max) {
  auto sumOp = [] __device__(int64_t & a, int64_t b) { a += b; };
  int64_t* d_temp2d;
  size_t pitch;
  // alloc 2d device memory: nbins rows of nBlocks partial counts each
  checkCudaErrors(
      cudaMallocPitch(&d_temp2d, &pitch, nBlocks * sizeof(int64_t), nbins));
  // launch the bincount kernel; smem is sized generously (one int64_t slot per
  // bin) although the per-block counters are 32-bit ints
  int smemSize = nbins * sizeof(int64_t);
  bincount_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
      size, nbins, pitch, d_idata, d_temp2d);
  checkCudaErrors(cudaGetLastError());
  // reduce each bin: sum the nBlocks partial counts of row i into d_counts[i].
  // Note: this relies on nblk == 1 (nBlocks <= nThreads); with more blocks the
  // intermediate partial sums would land past d_counts[i].
  int64_t dummy;
  int nth = (nBlocks < nThreads) ? nBlocks : nThreads;
  int nblk = (nBlocks + nth - 1) / nth;
  int64_t* d_row;
  for (int i = 0; i < nbins; i++) {
    d_row = (int64_t*)((char*)d_temp2d + i * pitch);
    reduce_gpu(nBlocks, nth, nblk, d_row, &d_counts[i], &dummy, sumOp, false);
  }
  // free 2d temp
  cudaFree(d_temp2d);
}
void bincount_init(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    const int64_t* d_idata,
    int64_t* h_counts,
    int64_t min,
    int64_t max) {
  auto nbins = max + 1;
  auto histBytes = nbins * sizeof(int64_t);
  int64_t* d_counts;
  // alloc hist on device
  checkCudaErrors(cudaMalloc(&d_counts, histBytes));
  // kernel call
  bincount_gpu(size, nThreads, nBlocks, nbins, d_idata, d_counts, min, max);
  // copy result device to host
  checkCudaErrors(
      cudaMemcpy(h_counts, d_counts, histBytes, cudaMemcpyDeviceToHost));
  PRINT_ARRAY("bincount GPU: ", h_counts, nbins);
  cudaFree(d_counts);
}

// histogram.cuh
#ifndef _HISTOGRAM_CUH_
#define _HISTOGRAM_CUH_
#include <sys/types.h>
#include <cstdint>

__global__ void bincount_kernel(
    int64_t size,
    int64_t nbins,
    size_t pitch,
    const int64_t* d_idata,
    int64_t* d_counts);

void bincount_gpu(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    int64_t nbins,
    const int64_t* d_idata,
    int64_t* d_counts,
    int64_t min,
    int64_t max);

void bincount_init(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    const int64_t* d_idata,
    int64_t* h_counts,
    int64_t min,
    int64_t max);
#endif // _HISTOGRAM_CUH_

# Makefile
NVCC=/usr/local/cuda/bin/nvcc
CCFLAGS=-std=c++11 --expt-extended-lambda -G -Xcompiler -rdynamic -m64 $(CFLAGS)
ARCH=-gencode arch=compute_35,code=sm_35
LIBS=-lcudart
TARGET=run
SOURCES = TestBincount.cu histogram.cu
OBJECTS = $(SOURCES:%.cu=%.o)

.PHONY: all clean

all: $(TARGET)

%.o: %.cu
	$(NVCC) $(CCFLAGS) $(ARCH) -c $<

$(TARGET): $(OBJECTS)
	$(NVCC) $(CCFLAGS) $(ARCH) -o $(TARGET) $^ $(LIBS)

clean:
	rm -f $(OBJECTS) $(TARGET)
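
Assuming the CUDA toolkit lives at /usr/local/cuda and the GPU supports sm_35 (adjust NVCC and ARCH otherwise), `make` produces the `run` binary and `./run` executes the test. Because CCFLAGS folds in $(CFLAGS), the data size and block size can be overridden from the command line, e.g. `make CFLAGS=-DINPUT_DATA_SIZE=1000000`.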

// reduce.cuh
#ifndef _REDUCE_CUH_
#define _REDUCE_CUH_
#include <sys/types.h>
#include <cstdint>
#include "helper.h" // UMUL, checkCudaErrors

// largest power of two strictly below a (e.g. LOWER_POW_2(256) == 128,
// LOWER_POW_2(100) == 64); used as the first reduction stride
#define LOWER_POW_2                                        \
  [](int a) {                                              \
    int pow2;                                              \
    pow2 = 1 << static_cast<int>(floor(log2((float)a)));   \
    if (pow2 == a)                                         \
      pow2 >>= 1;                                          \
    return pow2;                                           \
  }
template <typename Op>
__global__ void reduce_kernel(
    int size,
    const int64_t* d_idata,
    int64_t* d_odata,
    int pow2,
    Op op) {
  extern __shared__ unsigned char smem[];
  int64_t* sm = reinterpret_cast<int64_t*>(smem);
  auto tid = threadIdx.x;
  auto idx = UMUL(blockDim.x, blockIdx.x, threadIdx.x);
  // copy data from global memory into shared memory
  sm[tid] = (idx < size) ? d_idata[idx] : -1; // use -1 to detect out of bounds
  __syncthreads();
  // tree reduction on shared memory within the block; the (idx + s) < size
  // check keeps the -1 padding out of the result
  for (int s = pow2; s > 0; s >>= 1) {
    if (tid < s && (idx + s) < size) {
      op(sm[tid], sm[tid + s]);
    }
    __syncthreads();
  }
  // thread 0 writes this block's partial result
  if (tid == 0)
    d_odata[blockIdx.x] = sm[0];
}
template <typename Op>
void reduce_gpu(
    int size,
    int nThreads,
    int nBlocks,
    const int64_t* d_idata,
    int64_t* d_odata,
    int64_t* val,
    Op op,
    bool copyDtoH = true) {
  int smemSize = (nThreads <= 32) ? 2 * nThreads * sizeof(int64_t)
                                  : nThreads * sizeof(int64_t);
  int pow2 = LOWER_POW_2(nThreads);
  // first pass: reduce the input into one partial result per block
  reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
      size, d_idata, d_odata, pow2, op);
  checkCudaErrors(cudaGetLastError());
  // keep reducing the per-block partials until a single value remains.
  // Note: d_odata is both input and output of the same launch here; when a
  // pass still needs more than one block, a block may overwrite a partial
  // result before another block has read it (see the ping-pong sketch below).
  while (nBlocks > 1) {
    size = nBlocks;
    nBlocks = (nBlocks + nThreads - 1) / nThreads;
    reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
        size, d_odata, d_odata, pow2, op);
    checkCudaErrors(cudaGetLastError());
  }
  if (copyDtoH)
    checkCudaErrors(
        cudaMemcpy(val, d_odata, sizeof(int64_t), cudaMemcpyDeviceToHost));
}
#endif
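
A caveat on the multi-pass loop above: it launches reduce_kernel with d_odata as both input and output, so when a pass still needs more than one block, one block can overwrite a partial result from the previous pass before another block has read it (with the default sizes in this gist every follow-up pass uses a single block, so the issue never surfaces). A minimal ping-pong sketch, assuming two scratch device buffers of at least ceil(size / nThreads) elements; reduce_gpu_pingpong, d_bufA, and d_bufB are illustrative names, not part of the gist:

// alternate between two buffers so no launch reads and writes the same array
template <typename Op>
void reduce_gpu_pingpong(
    int size,
    int nThreads,
    const int64_t* d_idata,
    int64_t* d_bufA, // scratch, >= ceil(size / nThreads) elements
    int64_t* d_bufB, // scratch, >= ceil(size / nThreads) elements
    int64_t* h_result,
    Op op) {
  int smemSize = (nThreads <= 32) ? 2 * nThreads * sizeof(int64_t)
                                  : nThreads * sizeof(int64_t);
  int pow2 = LOWER_POW_2(nThreads);
  int nBlocks = (size + nThreads - 1) / nThreads;
  // first pass: one partial result per block, written into d_bufA
  reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
      size, d_idata, d_bufA, pow2, op);
  checkCudaErrors(cudaGetLastError());
  int64_t* d_in = d_bufA;
  int64_t* d_out = d_bufB;
  while (nBlocks > 1) {
    size = nBlocks;
    nBlocks = (size + nThreads - 1) / nThreads;
    // read the previous pass's partials, write the next level to the other buffer
    reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
        size, d_in, d_out, pow2, op);
    checkCudaErrors(cudaGetLastError());
    int64_t* tmp = d_in; // swap buffer roles for the next pass
    d_in = d_out;
    d_out = tmp;
  }
  // after the last swap, d_in holds the single reduced value
  checkCudaErrors(
      cudaMemcpy(h_result, d_in, sizeof(int64_t), cudaMemcpyDeviceToHost));
}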

// TestBincount.cu
// vanilla CUDA bincount implementation
#include "helper.h"
#include "histogram.cuh"
#include "reduce.cuh"
#ifndef INPUT_DATA_SIZE
#define INPUT_DATA_SIZE 10
#endif
#ifndef NTHREADS
#define NTHREADS 256
#endif
#define UNIFORM_DISTRIBUTION_HIGH 17
#define UNIFORM_DISTRIBUTION_LOW 4

// element-wise comparison of two arrays of length n
#define ASSERT_EQUAL(a, b, n)                                  \
  {                                                            \
    bool f = true;                                             \
    for (int i = 0; i < (n); i++) {                            \
      if ((a)[i] != (b)[i]) {                                  \
        f = false;                                             \
        break;                                                 \
      }                                                        \
    }                                                          \
    std::cout << "GPU" << ((f) ? " == " : " != ") << "CPU\n";  \
  }

// forward declarations
void randominit(int64_t* h_data);
template <typename Op>
void reduce_cpu(const int64_t* h_idata, Op op);
void bincount_cpu(
    int64_t size,
    const int64_t* h_data,
    const int64_t* gpu_hist,
    int64_t min,
    int64_t max);
int main(int argc, char** argv) {
  int64_t h_idata[INPUT_DATA_SIZE]{0};
  auto minOp = [] __host__ __device__(int64_t & a, int64_t b) {
    a = (a < b) ? a : b;
  };
  auto maxOp = [] __host__ __device__(int64_t & a, int64_t b) {
    a = (a > b) ? a : b;
  };
  ////////////////////////////////////////////////////////////////////////
  // random init
  randominit(h_idata);
  // print input data
  PRINT_ARRAY("Input: ", h_idata, INPUT_DATA_SIZE);
  ////////////////////////////////////////////////////////////////////////
  // alloc device input buffer and copy the input from host to device
  size_t size = INPUT_DATA_SIZE * sizeof(int64_t);
  int64_t *d_idata, *d_odata;
  checkCudaErrors(cudaMalloc(&d_idata, size));
  checkCudaErrors(cudaMemcpy(d_idata, h_idata, size, cudaMemcpyHostToDevice));
  ////////////////////////////////////////////////////////////////////////
  // call kernel and main gpu computation logic
  int64_t minVal, maxVal;
  int nThreads = NTHREADS;
  int nBlocks = (INPUT_DATA_SIZE + nThreads - 1) / nThreads;
  std::cout << "#threads: " << nThreads << " #blocks: " << nBlocks << std::endl;
  // alloc device output data
  checkCudaErrors(cudaMalloc(&d_odata, nBlocks * sizeof(int64_t)));
  // find the min of the array
  reduce_gpu(
      INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, d_odata, &minVal, minOp);
  reduce_cpu(h_idata, minOp);
  PRINT_VALUE("Min GPU: ", minVal);
  // find the max of the array - determines nbins
  reduce_gpu(
      INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, d_odata, &maxVal, maxOp);
  reduce_cpu(h_idata, maxOp);
  PRINT_VALUE("Max GPU: ", maxVal);
  // do bincount calc
  auto h_counts = (int64_t*)malloc((maxVal + 1) * sizeof(int64_t));
  bincount_init(
      INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, h_counts, minVal, maxVal);
  bincount_cpu(INPUT_DATA_SIZE, h_idata, h_counts, minVal, maxVal);
  free(h_counts);
  ////////////////////////////////////////////////////////////////////////
  // free device memory
  cudaFree(d_idata);
  cudaFree(d_odata);
  cudaDeviceSynchronize();
  cudaDeviceReset();
  return 0;
}
void randominit(int64_t* h_data) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> distr(
      UNIFORM_DISTRIBUTION_LOW, UNIFORM_DISTRIBUTION_HIGH);
  for (int i = 0; i < INPUT_DATA_SIZE; i++) {
    h_data[i] = static_cast<int64_t>(distr(gen));
  }
}
template <typename Op>
void reduce_cpu(const int64_t* h_idata, Op op) {
  // sequential reference reduction for verification
  int64_t output = h_idata[0];
  for (int i = 1; i < INPUT_DATA_SIZE; i++) {
    op(output, h_idata[i]);
  }
  PRINT_VALUE("Reduce CPU: ", output);
}
void bincount_cpu(
    int64_t size,
    const int64_t* h_data,
    const int64_t* gpu_hist,
    int64_t min,
    int64_t max) {
  // cpu bincount implementation for verification
  auto sz = max + 1;
  int64_t* hist = (int64_t*)malloc(sz * sizeof(int64_t));
  for (int i = 0; i < sz; i++) {
    hist[i] = 0;
  }
  for (int i = 0; i < size; i++) {
    hist[h_data[i]] += 1;
  }
  PRINT_ARRAY("bincount CPU: ", hist, sz);
  ASSERT_EQUAL(gpu_hist, hist, sz);
  free(hist);
}