Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Last active January 30, 2021 04:35
Show Gist options
  • Save sandeepkumar-skb/abed94574cfefe83fcedbb530c14a2a7 to your computer and use it in GitHub Desktop.
Save sandeepkumar-skb/abed94574cfefe83fcedbb530c14a2a7 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <iostream>
#define BLOCK_SIZE 256
#define GRID_SIZE 72 //Turing Titan RTX
#define OUT_SIZE 256
/**
 * Terminates the program if a CUDA runtime call reported an error.
 *
 * @param err   Status code returned by the CUDA runtime call.
 * @param file  Source file of the call site (normally __FILE__).
 * @param line  Source line of the call site (normally __LINE__).
 *
 * Does nothing when err is cudaSuccess. Otherwise prints the runtime's
 * description of the error together with the call site and exits with
 * EXIT_FAILURE.
 */
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess) {
        return;
    }
    // Diagnostics go to stderr so they are not lost when stdout is redirected
    // or still buffered at the moment the process exits.
    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
// Checks the result of a CUDA runtime call via gpuAssert, tagging it with the
// call site. The do { } while (0) wrapper makes the expansion a single
// statement, so `if (c) gpuErrchk(x); else ...` parses as intended; a bare
// { } block followed by the caller's ';' would detach the else.
#define gpuErrchk(ans)                        \
    do {                                      \
        gpuAssert((ans), __FILE__, __LINE__); \
    } while (0)
/**
 * Histogram kernel using a per-block shared-memory partial histogram.
 *
 * Each pixel is truncated to int and binned by (value mod bin count). Every
 * block accumulates into its own shared array and then adds its non-zero
 * partial counts to `out` with one global atomic per bin.
 *
 * @param img       height*width input values, readable from the device.
 * @param height    Image height; nothing is counted if <= 0.
 * @param width     Image width; nothing is counted if <= 0.
 * @param out       Bin counters, accumulated into (the kernel never zeroes
 *                  them, so the caller must initialise them).
 * @param out_size  Number of counters in `out`. The shared array is statically
 *                  sized, so at most OUT_SIZE bins are used; a smaller
 *                  out_size reduces the bin count so `out` is never overrun.
 *
 * Launch layout: only the x dimension of grid and block is used. Any 1-D
 * grid/block size is valid because every loop strides by the actual launch
 * dimensions. Shared memory: OUT_SIZE * sizeof(unsigned int), static.
 */
__global__
void histo_d(float* img, int height, int width, unsigned int *out, int out_size){
    __shared__ unsigned int shmem[OUT_SIZE];

    const int num_bins = out_size < OUT_SIZE ? out_size : OUT_SIZE;
    // These conditions depend only on kernel arguments, so every thread of
    // the block takes the same path and none is left waiting at a barrier.
    if (num_bins <= 0 || height <= 0 || width <= 0) {
        return;
    }

    // 64-bit arithmetic: height*width and the flat index may exceed INT_MAX.
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const unsigned int bins = static_cast<unsigned int>(num_bins);

    // Stride by blockDim.x (not a compile-time block size) so that every bin
    // is cleared even when the block has fewer threads than there are bins.
    for (unsigned int b = threadIdx.x; b < bins; b += blockDim.x) {
        shmem[b] = 0;
    }
    __syncthreads();

    for (size_t i = first; i < num_pixels; i += stride) {
        int bucket = static_cast<int>(img[i]) % num_bins;
        if (bucket < 0) {
            // C++ '%' keeps the sign of the dividend; fold negative pixel
            // values back into [0, num_bins) instead of indexing before shmem.
            bucket += num_bins;
        }
        atomicAdd(&shmem[bucket], 1u);
    }
    __syncthreads();

    for (unsigned int b = threadIdx.x; b < bins; b += blockDim.x) {
        const unsigned int count = shmem[b];
        if (count != 0) {
            // Adding zero changes nothing, so skip the global atomic.
            atomicAdd(&out[b], count);
        }
    }
}
/**
 * Host-side histogram; main() uses it as the CPU reference for histo_d.
 *
 * Each pixel is truncated to int and binned by (value mod bin count).
 *
 * @param img       height*width input values, readable from the host.
 * @param height    Image height; nothing is counted if <= 0.
 * @param width     Image width; nothing is counted if <= 0.
 * @param out       Bin counters, accumulated into (not zeroed here).
 * @param out_size  Number of counters in `out`. At most OUT_SIZE bins are
 *                  used; a smaller out_size reduces the bin count so `out`
 *                  is never overrun.
 */
void histo_h(float* img, int height, int width, unsigned int *out, int out_size){
    const int num_bins = out_size < OUT_SIZE ? out_size : OUT_SIZE;
    if (num_bins <= 0 || height <= 0 || width <= 0) {
        return;
    }

    // 64-bit pixel count: height*width may not fit in int.
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);
    for (size_t i = 0; i < num_pixels; ++i) {
        // NOTE(review): converting a float outside int's range is undefined on
        // the host; inputs are assumed to fit in int - confirm for real data.
        int bucket = static_cast<int>(img[i]) % num_bins;
        if (bucket < 0) {
            // '%' keeps the dividend's sign; fold into [0, num_bins).
            bucket += num_bins;
        }
        ++out[bucket];
    }
}
/**
 * Driver: fills a 1024x1024 image with the ramp 0, 1, 2, ..., computes its
 * histogram on the GPU (histo_d, timed with CUDA events) and on the CPU
 * (histo_h, timed with std::chrono), reports every bin where the two differ,
 * and prints both timings.
 *
 * @return EXIT_SUCCESS when both histograms agree, EXIT_FAILURE otherwise.
 */
int main(){
    const int out_size = OUT_SIZE;
    const int height = 1024;
    const int width = 1024;
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);
    const size_t hist_bytes = static_cast<size_t>(out_size) * sizeof(unsigned int);

    float *img = nullptr;
    unsigned int *out = nullptr;
    gpuErrchk(cudaMallocManaged(&img, num_pixels * sizeof(float)));
    gpuErrchk(cudaMallocManaged(&out, hist_bytes));

    unsigned int *out_h = static_cast<unsigned int*>(malloc(hist_bytes));
    if (out_h == nullptr) {
        fprintf(stderr, "failed to allocate %d host histogram bins\n", out_size);
        cudaFree(img);
        cudaFree(out);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < num_pixels; ++i) {
        img[i] = static_cast<float>(i);
    }
    for (int i = 0; i < out_size; ++i) {
        out[i] = 0;
        out_h[i] = 0;
    }

    // --- GPU run, timed with events recorded around the launch ---
    // NOTE(review): img/out were last written by the host, so on devices that
    // migrate managed memory on demand this interval presumably includes page
    // migration as well as kernel execution.
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start));
    histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
    // A kernel launch returns no status; an invalid launch configuration is
    // only visible through cudaGetLastError().
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop));
    // Waiting on `stop` also guarantees the kernel has finished before the
    // host reads `out` below.
    gpuErrchk(cudaEventSynchronize(stop));
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    // --- CPU reference run ---
    const std::chrono::high_resolution_clock::time_point ch_start =
        std::chrono::high_resolution_clock::now();
    histo_h(img, height, width, out_h, out_size);
    const std::chrono::high_resolution_clock::time_point ch_end =
        std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double> span =
        std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

    // --- Compare bin by bin ---
    int mismatches = 0;
    for (int i = 0; i < out_size; ++i) {
        if (out[i] != out_h[i]) {
            ++mismatches;
            std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
        }
    }

    printf("GPU Effective time: %f ms\n", milliseconds);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    gpuErrchk(cudaFree(img));
    gpuErrchk(cudaFree(out));
    free(out_h);

    // Report a failed comparison through the exit status so that scripts
    // running this check can detect it.
    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
@sandeepkumar-skb
Copy link
Author

Compile & Run:

GPU Effective time: 2.859360 ms
CPU Time: 3.61055ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment