@chintak · Created April 20, 2018 15:36
CUDA `bincount` and `reduce` op implementation
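The gist is split across six files: helper.h (error-checking and printing macros), histogram.cu and histogram.cuh (the bincount kernel and its host-side drivers), reduce.cuh (a templated block-wise reduction used for min, max, and for summing the per-block histogram counts), a Makefile, and TestBincount.cu (a test driver that compares the GPU results against a CPU reference).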
// helper.h
#ifndef _HELPER_H_
#define _HELPER_H_
#include <cuda_runtime.h>
#include <sys/types.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <random>
#include <vector>

#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
// multiply-add: a * b + c
#define UMUL(a, b, c) (((a) * (b)) + (c))

// print at most the first 20 elements of an array
#define PRINT_ARRAY(name, v, sz)                          \
  {                                                       \
    auto s = ((sz < 20) ? sz : 20);                       \
    std::cout << name;                                    \
    for (int i = 0; i < s; i++)                           \
      std::cout << v[i] << ", ";                          \
    std::cout << ((s == sz) ? "" : "...") << std::endl;   \
  }
#define PRINT_VALUE(name, v) std::cout << name << v << std::endl;

inline void
check(cudaError_t code, const char* file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(
        stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
      exit(code);
  }
}
// This will output the proper CUDA error strings in the event that a CUDA host
// call returns an error
#define checkCudaErrors(val) check((val), __FILE__, __LINE__)
#endif // _HELPER_H_
#include "helper.h"
#include "histogram.cuh"
#include "reduce.cuh"
// Each block builds a private histogram in shared memory and then writes its
// partial counts into row `bin`, column `blockIdx.x` of the pitched 2D array
// d_counts; the per-bin totals are summed across blocks afterwards.
__global__ void bincount_kernel(
    int64_t size,
    int64_t nbins,
    size_t pitch,
    const int64_t* d_idata,
    int64_t* d_counts) {
  extern __shared__ unsigned char smem[];
  int* sHist = reinterpret_cast<int*>(smem);
  auto tid = threadIdx.x;
  auto idx = UMUL(blockDim.x, blockIdx.x, threadIdx.x);
  // reset shared mem
  for (size_t i = tid; i < nbins; i += blockDim.x) {
    sHist[i] = 0;
  }
  __syncthreads();
  // inc counter
  if (idx < size) {
    int64_t val = d_idata[idx];
    atomicAdd(&sHist[val], 1);
  }
  __syncthreads();
  // move the bin values into global device memory d_counts
  for (size_t i = tid; i < nbins; i += blockDim.x) {
    int64_t* d_row = (int64_t*)((char*)d_counts + i * pitch);
    d_row[blockIdx.x] = static_cast<int64_t>(sHist[i]);
  }
}
void bincount_gpu(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    int64_t nbins,
    const int64_t* d_idata,
    int64_t* d_counts,
    int64_t min,
    int64_t max) {
  auto sumOp = [] __device__(int64_t & a, int64_t b) { a += b; };
  int64_t* d_temp2d;
  size_t pitch;
  // alloc 2d device memory: nbins rows of nBlocks partial counts each
  checkCudaErrors(
      cudaMallocPitch(&d_temp2d, &pitch, nBlocks * sizeof(int64_t), nbins));
  // launch the bincount kernel; smem is sized generously (one int64_t slot per
  // bin) although the per-block counters are 32-bit ints
  int smemSize = nbins * sizeof(int64_t);
  bincount_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
      size, nbins, pitch, d_idata, d_temp2d);
  checkCudaErrors(cudaGetLastError());
  // reduce each bin: sum the nBlocks partial counts of row i into d_counts[i].
  // Note: this relies on nblk == 1 (nBlocks <= nThreads); with more blocks the
  // intermediate partial sums would land past d_counts[i].
  int64_t dummy;
  int nth = (nBlocks < nThreads) ? nBlocks : nThreads;
  int nblk = (nBlocks + nth - 1) / nth;
  int64_t* d_row;
  for (int i = 0; i < nbins; i++) {
    d_row = (int64_t*)((char*)d_temp2d + i * pitch);
    reduce_gpu(nBlocks, nth, nblk, d_row, &d_counts[i], &dummy, sumOp, false);
  }
  // free 2d temp
  cudaFree(d_temp2d);
}
void bincount_init(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    const int64_t* d_idata,
    int64_t* h_counts,
    int64_t min,
    int64_t max) {
  auto nbins = max + 1;
  auto histBytes = nbins * sizeof(int64_t);
  int64_t* d_counts;
  // alloc hist on device
  checkCudaErrors(cudaMalloc(&d_counts, histBytes));
  // kernel call
  bincount_gpu(size, nThreads, nBlocks, nbins, d_idata, d_counts, min, max);
  // copy result device to host
  checkCudaErrors(
      cudaMemcpy(h_counts, d_counts, histBytes, cudaMemcpyDeviceToHost));
  PRINT_ARRAY("bincount GPU: ", h_counts, nbins);
  cudaFree(d_counts);
}

// histogram.cuh
#ifndef _HISTOGRAM_CUH_
#define _HISTOGRAM_CUH_
#include <sys/types.h>
#include <cstdint>

__global__ void bincount_kernel(
    int64_t size,
    int64_t nbins,
    size_t pitch,
    const int64_t* d_idata,
    int64_t* d_counts);

void bincount_gpu(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    int64_t nbins,
    const int64_t* d_idata,
    int64_t* d_counts,
    int64_t min,
    int64_t max);

void bincount_init(
    int64_t size,
    int64_t nThreads,
    int64_t nBlocks,
    const int64_t* d_idata,
    int64_t* h_counts,
    int64_t min,
    int64_t max);
#endif // _HISTOGRAM_CUH_

# Makefile
NVCC=/usr/local/cuda/bin/nvcc
CCFLAGS=-std=c++11 --expt-extended-lambda -G -Xcompiler -rdynamic -m64 $(CFLAGS)
ARCH=-gencode arch=compute_35,code=sm_35
LIBS=-lcudart
TARGET=run
SOURCES = TestBincount.cu histogram.cu
OBJECTS = $(SOURCES:%.cu=%.o)

.PHONY: all clean

all: $(TARGET)

%.o: %.cu
	$(NVCC) $(CCFLAGS) $(ARCH) -c $<

$(TARGET): $(OBJECTS)
	$(NVCC) $(CCFLAGS) $(ARCH) -o $(TARGET) $^ $(LIBS)

clean:
	rm -f $(OBJECTS) $(TARGET)
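
Assuming the CUDA toolkit lives at /usr/local/cuda and the GPU supports sm_35 (adjust NVCC and ARCH otherwise), `make` produces the `run` binary and `./run` executes the test. Because CCFLAGS folds in $(CFLAGS), the data size and block size can be overridden from the command line, e.g. `make CFLAGS=-DINPUT_DATA_SIZE=1000000`.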

// reduce.cuh
#ifndef _REDUCE_CUH_
#define _REDUCE_CUH_
#include <sys/types.h>
#include <cstdint>
#include "helper.h" // UMUL, checkCudaErrors

// largest power of two strictly below a (e.g. LOWER_POW_2(256) == 128,
// LOWER_POW_2(100) == 64); used as the first reduction stride
#define LOWER_POW_2                                        \
  [](int a) {                                              \
    int pow2;                                              \
    pow2 = 1 << static_cast<int>(floor(log2((float)a)));   \
    if (pow2 == a)                                         \
      pow2 >>= 1;                                          \
    return pow2;                                           \
  }
template <typename Op>
__global__ void reduce_kernel(
    int size,
    const int64_t* d_idata,
    int64_t* d_odata,
    int pow2,
    Op op) {
  extern __shared__ unsigned char smem[];
  int64_t* sm = reinterpret_cast<int64_t*>(smem);
  auto tid = threadIdx.x;
  auto idx = UMUL(blockDim.x, blockIdx.x, threadIdx.x);
  // copy data from global memory into shared memory
  sm[tid] = (idx < size) ? d_idata[idx] : -1; // use -1 to detect out of bounds
  __syncthreads();
  // tree reduction on shared memory within the block; the (idx + s) < size
  // check keeps the -1 padding out of the result
  for (int s = pow2; s > 0; s >>= 1) {
    if (tid < s && (idx + s) < size) {
      op(sm[tid], sm[tid + s]);
    }
    __syncthreads();
  }
  // thread 0 writes this block's partial result
  if (tid == 0)
    d_odata[blockIdx.x] = sm[0];
}
template <typename Op>
void reduce_gpu(
    int size,
    int nThreads,
    int nBlocks,
    const int64_t* d_idata,
    int64_t* d_odata,
    int64_t* val,
    Op op,
    bool copyDtoH = true) {
  int smemSize = (nThreads <= 32) ? 2 * nThreads * sizeof(int64_t)
                                  : nThreads * sizeof(int64_t);
  int pow2 = LOWER_POW_2(nThreads);
  // first pass: reduce the input into one partial result per block
  reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
      size, d_idata, d_odata, pow2, op);
  checkCudaErrors(cudaGetLastError());
  // keep reducing the per-block partials until a single value remains.
  // Note: d_odata is both input and output of the same launch here; when a
  // pass still needs more than one block, a block may overwrite a partial
  // result before another block has read it (see the ping-pong sketch below).
  while (nBlocks > 1) {
    size = nBlocks;
    nBlocks = (nBlocks + nThreads - 1) / nThreads;
    reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
        size, d_odata, d_odata, pow2, op);
    checkCudaErrors(cudaGetLastError());
  }
  if (copyDtoH)
    checkCudaErrors(
        cudaMemcpy(val, d_odata, sizeof(int64_t), cudaMemcpyDeviceToHost));
}
#endif
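
A caveat on the multi-pass loop above: it launches reduce_kernel with d_odata as both input and output, so when a pass still needs more than one block, one block can overwrite a partial result from the previous pass before another block has read it (with the default sizes in this gist every follow-up pass uses a single block, so the issue never surfaces). A minimal ping-pong sketch, assuming two scratch device buffers of at least ceil(size / nThreads) elements; reduce_gpu_pingpong, d_bufA, and d_bufB are illustrative names, not part of the gist:

// alternate between two buffers so no launch reads and writes the same array
template <typename Op>
void reduce_gpu_pingpong(
    int size,
    int nThreads,
    const int64_t* d_idata,
    int64_t* d_bufA, // scratch, >= ceil(size / nThreads) elements
    int64_t* d_bufB, // scratch, >= ceil(size / nThreads) elements
    int64_t* h_result,
    Op op) {
  int smemSize = (nThreads <= 32) ? 2 * nThreads * sizeof(int64_t)
                                  : nThreads * sizeof(int64_t);
  int pow2 = LOWER_POW_2(nThreads);
  int nBlocks = (size + nThreads - 1) / nThreads;
  // first pass: one partial result per block, written into d_bufA
  reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
      size, d_idata, d_bufA, pow2, op);
  checkCudaErrors(cudaGetLastError());
  int64_t* d_in = d_bufA;
  int64_t* d_out = d_bufB;
  while (nBlocks > 1) {
    size = nBlocks;
    nBlocks = (size + nThreads - 1) / nThreads;
    // read the previous pass's partials, write the next level to the other buffer
    reduce_kernel<<<nBlocks, nThreads, smemSize, 0>>>(
        size, d_in, d_out, pow2, op);
    checkCudaErrors(cudaGetLastError());
    int64_t* tmp = d_in; // swap buffer roles for the next pass
    d_in = d_out;
    d_out = tmp;
  }
  // after the last swap, d_in holds the single reduced value
  checkCudaErrors(
      cudaMemcpy(h_result, d_in, sizeof(int64_t), cudaMemcpyDeviceToHost));
}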

// TestBincount.cu
// vanilla CUDA bincount implementation
#include "helper.h"
#include "histogram.cuh"
#include "reduce.cuh"
#ifndef INPUT_DATA_SIZE
#define INPUT_DATA_SIZE 10
#endif
#ifndef NTHREADS
#define NTHREADS 256
#endif
#define UNIFORM_DISTRIBUTION_HIGH 17
#define UNIFORM_DISTRIBUTION_LOW 4

// element-wise comparison of two arrays of length n
#define ASSERT_EQUAL(a, b, n)                                  \
  {                                                            \
    bool f = true;                                             \
    for (int i = 0; i < (n); i++) {                            \
      if ((a)[i] != (b)[i]) {                                  \
        f = false;                                             \
        break;                                                 \
      }                                                        \
    }                                                          \
    std::cout << "GPU" << ((f) ? " == " : " != ") << "CPU\n";  \
  }

// forward declarations
void randominit(int64_t* h_data);
template <typename Op>
void reduce_cpu(const int64_t* h_idata, Op op);
void bincount_cpu(
    int64_t size,
    const int64_t* h_data,
    const int64_t* gpu_hist,
    int64_t min,
    int64_t max);
int main(int argc, char** argv) {
  int64_t h_idata[INPUT_DATA_SIZE]{0};
  auto minOp = [] __host__ __device__(int64_t & a, int64_t b) {
    a = (a < b) ? a : b;
  };
  auto maxOp = [] __host__ __device__(int64_t & a, int64_t b) {
    a = (a > b) ? a : b;
  };
  ////////////////////////////////////////////////////////////////////////
  // random init
  randominit(h_idata);
  // print input data
  PRINT_ARRAY("Input: ", h_idata, INPUT_DATA_SIZE);
  ////////////////////////////////////////////////////////////////////////
  // alloc device input buffer and copy the input from host to device
  size_t size = INPUT_DATA_SIZE * sizeof(int64_t);
  int64_t *d_idata, *d_odata;
  checkCudaErrors(cudaMalloc(&d_idata, size));
  checkCudaErrors(cudaMemcpy(d_idata, h_idata, size, cudaMemcpyHostToDevice));
  ////////////////////////////////////////////////////////////////////////
  // call kernel and main gpu computation logic
  int64_t minVal, maxVal;
  int nThreads = NTHREADS;
  int nBlocks = (INPUT_DATA_SIZE + nThreads - 1) / nThreads;
  std::cout << "#threads: " << nThreads << " #blocks: " << nBlocks << std::endl;
  // alloc device output data
  checkCudaErrors(cudaMalloc(&d_odata, nBlocks * sizeof(int64_t)));
  // find the min of the array
  reduce_gpu(
      INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, d_odata, &minVal, minOp);
  reduce_cpu(h_idata, minOp);
  PRINT_VALUE("Min GPU: ", minVal);
  // find the max of the array - determines nbins
  reduce_gpu(
      INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, d_odata, &maxVal, maxOp);
  reduce_cpu(h_idata, maxOp);
  PRINT_VALUE("Max GPU: ", maxVal);
  // do bincount calc
  auto h_counts = (int64_t*)malloc((maxVal + 1) * sizeof(int64_t));
  bincount_init(
      INPUT_DATA_SIZE, nThreads, nBlocks, d_idata, h_counts, minVal, maxVal);
  bincount_cpu(INPUT_DATA_SIZE, h_idata, h_counts, minVal, maxVal);
  free(h_counts);
  ////////////////////////////////////////////////////////////////////////
  // free device memory
  cudaFree(d_idata);
  cudaFree(d_odata);
  cudaDeviceSynchronize();
  cudaDeviceReset();
  return 0;
}
void randominit(int64_t* h_data) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> distr(
      UNIFORM_DISTRIBUTION_LOW, UNIFORM_DISTRIBUTION_HIGH);
  for (int i = 0; i < INPUT_DATA_SIZE; i++) {
    h_data[i] = static_cast<int64_t>(distr(gen));
  }
}
template <typename Op>
void reduce_cpu(const int64_t* h_idata, Op op) {
  // sequential reference reduction for verification
  int64_t output = h_idata[0];
  for (int i = 1; i < INPUT_DATA_SIZE; i++) {
    op(output, h_idata[i]);
  }
  PRINT_VALUE("Reduce CPU: ", output);
}
void bincount_cpu(
    int64_t size,
    const int64_t* h_data,
    const int64_t* gpu_hist,
    int64_t min,
    int64_t max) {
  // cpu bincount implementation for verification
  auto sz = max + 1;
  int64_t* hist = (int64_t*)malloc(sz * sizeof(int64_t));
  for (int i = 0; i < sz; i++) {
    hist[i] = 0;
  }
  for (int i = 0; i < size; i++) {
    hist[h_data[i]] += 1;
  }
  PRINT_ARRAY("bincount CPU: ", hist, sz);
  ASSERT_EQUAL(gpu_hist, hist, sz);
  free(hist);
}