sandeepkumar-skb/histogram_shmem.cu

## histogram_shmem.cu
#include <stdio.h>
#include <iostream>
#include <chrono>

// Threads per block used for the histo_d launch.
#define BLOCK_SIZE 256
// Number of blocks used for the histo_d launch.
// NOTE(review): presumably picked to match the Titan RTX's SM count - confirm before retuning.
#define GRID_SIZE 72 //Turing Titan RTX
// Number of histogram bins; also the length of the kernel's static __shared__ array.
#define OUT_SIZE  256

// Abort the program if a CUDA runtime call did not succeed.
//
//   err  - status code returned by a CUDA runtime API call
//   file - source file of the call site (used only in the diagnostic)
//   line - source line of the call site (used only in the diagnostic)
//
// Returns normally when err == cudaSuccess. Otherwise prints the CUDA error
// string together with the call site and terminates with EXIT_FAILURE.
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess){
        return;
    }
    // Diagnostics belong on stderr so they are not mixed into (or lost with)
    // redirected program output.
    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
// Evaluate a CUDA runtime call and abort with file/line context if it failed.
// The do { ... } while (0) wrapper makes the expansion a single statement, so
// `gpuErrchk(x);` stays correct inside an unbraced if/else body.
#define gpuErrchk(ans)                        \
do {                                          \
    gpuAssert((ans), __FILE__, __LINE__);     \
} while (0)


// Histogram kernel: each block counts into its own __shared__ sub-histogram
// and then adds that sub-histogram into `out` with atomics.
//
// A pixel is truncated to int and counted in bin (value mod OUT_SIZE), folded
// into [0, OUT_SIZE).
//
// Launch layout : 1D grid of 1D blocks, any sizes (pixels are visited with a
//                 stride of gridDim.x * blockDim.x).
// Shared memory : OUT_SIZE unsigned ints, statically allocated.
//
//   img      - height*width pixel values readable from the device
//   height   - image height in pixels
//   width    - image width in pixels
//   out      - global histogram with at least OUT_SIZE bins; counts are ADDED
//              to its current contents, so the caller zeroes it beforehand
//   out_size - not read by the kernel; the bin count is fixed at OUT_SIZE
__global__
void histo_d(float* img, int height, int width, unsigned int *out, int out_size){
    // Index in 64 bits: height*width and blockIdx.x*blockDim.x can both
    // exceed what an int holds. Non-positive dimensions mean "no pixels".
    const size_t num_pixels = (height > 0 && width > 0)
        ? static_cast<size_t>(height) * static_cast<size_t>(width)
        : 0;
    const size_t first  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    __shared__ unsigned int shmem[OUT_SIZE];

    // Step by the real block size rather than the BLOCK_SIZE macro so every
    // bin is cleared even when launched with fewer than BLOCK_SIZE threads.
    for (unsigned int i = threadIdx.x; i < OUT_SIZE; i += blockDim.x){
        shmem[i] = 0;
    }
    // All bins must be zero before any thread starts counting.
    __syncthreads();

    for (size_t i = first; i < num_pixels; i += stride){
        int bucket = static_cast<int>(img[i]) % OUT_SIZE;
        // % keeps the sign of the dividend; fold negative remainders back
        // into range instead of indexing before the start of shmem.
        if (bucket < 0){
            bucket += OUT_SIZE;
        }
        atomicAdd(&shmem[bucket], 1u);
    }
    // All counting must finish before the block's totals are published.
    __syncthreads();

    for (unsigned int i = threadIdx.x; i < OUT_SIZE; i += blockDim.x){
        const unsigned int count = shmem[i];
        // An empty bin adds nothing; skip the global atomic for it.
        if (count != 0){
            atomicAdd(&out[i], count);
        }
    }

}

// CPU reference histogram: truncates each pixel to int and increments bin
// (value mod OUT_SIZE), folded into [0, OUT_SIZE).
//
//   img      - height*width pixel values
//   height   - image height in pixels
//   width    - image width in pixels
//   out      - histogram with at least OUT_SIZE bins; counts are added to its
//              current contents, so the caller zeroes it beforehand
//   out_size - not read; the bin count is fixed at OUT_SIZE
void histo_h(float* img, int height, int width, unsigned int *out, int out_size){
    // Compute the pixel count in size_t so large images cannot overflow int;
    // non-positive dimensions mean "no pixels".
    const size_t num_pixels = (height > 0 && width > 0)
        ? static_cast<size_t>(height) * static_cast<size_t>(width)
        : 0;
    for (size_t i = 0; i < num_pixels; ++i){
        int bucket = static_cast<int>(img[i]) % OUT_SIZE;
        // % keeps the sign of the dividend; fold negative remainders back
        // into range instead of writing before the start of out.
        if (bucket < 0){
            bucket += OUT_SIZE;
        }
        out[bucket]++;
    }
}

// Driver: fills a 1024x1024 image with the values 0, 1, 2, ..., histograms it
// with histo_d on the GPU and histo_h on the CPU, reports every bin where the
// two disagree, and prints both timings.
//
// Returns EXIT_SUCCESS when all bins match and EXIT_FAILURE otherwise. A
// failing CUDA runtime call terminates the process inside gpuErrchk.
int main(){
    float *img;
    unsigned int *out;
    unsigned int *out_h;
    int out_size = OUT_SIZE;
    int height = 1024;
    int width = 1024;
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);

    // Managed allocations: the same pointers are used on the host and in the kernel.
    gpuErrchk(cudaMallocManaged(&img, num_pixels*sizeof(float)));
    gpuErrchk(cudaMallocManaged(&out, out_size*sizeof(unsigned int)));

    out_h = (unsigned int*) malloc(out_size*sizeof(unsigned int));
    if (out_h == NULL){
        fprintf(stderr, "failed to allocate %d host histogram bins\n", out_size);
        cudaFree(img);
        cudaFree(out);
        return EXIT_FAILURE;
    }

    for (size_t i=0; i < num_pixels; ++i){
        img[i] = static_cast<float>(i);
    }

    // Both histograms accumulate into their output, so start from zero.
    for (int i=0; i < out_size; ++i){
        out[i] = 0;
        out_h[i] = 0;
    }

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // NOTE(review): img/out were last written by the host, so the measured
    // interval may include managed-memory migration - confirm if pure kernel
    // time is what should be reported.
    gpuErrchk(cudaEventRecord(start));
    histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
    // A kernel launch returns no status; a bad launch is only visible here.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop));

    gpuErrchk(cudaEventSynchronize(stop));
    // Make sure the device is idle (and any execution fault is reported)
    // before the host reads the managed buffers again.
    gpuErrchk(cudaDeviceSynchronize());

    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));

    std::chrono::high_resolution_clock::time_point ch_start;
    std::chrono::high_resolution_clock::time_point ch_end ;
    std::chrono::duration<double> span;
    ch_start = std::chrono::high_resolution_clock::now();
    histo_h(img, height, width, out_h, out_size);
    ch_end = std::chrono::high_resolution_clock::now();
    span = std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

    int mismatches = 0;
    for(int i=0; i < out_size; ++i){
        if (out[i] != out_h[i]){
            std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
            ++mismatches;
        }
    }

    printf("GPU Effective time: %f ms\n", milliseconds);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(img));
    gpuErrchk(cudaFree(out));
    free(out_h);

    // Let scripts detect a wrong result from the exit status.
    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
	#include <stdio.h>
	#include <iostream>
	#include <chrono>

	// Threads per block used for the histo_d launch.
	#define BLOCK_SIZE 256
	// Number of blocks used for the histo_d launch.
	// NOTE(review): presumably picked to match the Titan RTX's SM count - confirm before retuning.
	#define GRID_SIZE 72 //Turing Titan RTX
	// Number of histogram bins; also the length of the kernel's static __shared__ array.
	#define OUT_SIZE 256

	// Abort the program if a CUDA runtime call did not succeed.
	//
	//   err  - status code returned by a CUDA runtime API call
	//   file - source file of the call site (used only in the diagnostic)
	//   line - source line of the call site (used only in the diagnostic)
	//
	// Returns normally when err == cudaSuccess. Otherwise prints the CUDA error
	// string together with the call site and terminates with EXIT_FAILURE.
	inline void gpuAssert(cudaError_t err, const char *file, int line)
	{
	    if (err == cudaSuccess){
	        return;
	    }
	    // Diagnostics belong on stderr so they are not mixed into (or lost with)
	    // redirected program output.
	    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
	    exit(EXIT_FAILURE);
	}
	// Evaluate a CUDA runtime call and abort with file/line context if it failed.
	// The do { ... } while (0) wrapper makes the expansion a single statement, so
	// `gpuErrchk(x);` stays correct inside an unbraced if/else body.
	#define gpuErrchk(ans)                        \
	do {                                          \
	    gpuAssert((ans), __FILE__, __LINE__);     \
	} while (0)


	// Histogram kernel: each block counts into its own __shared__ sub-histogram
	// and then adds that sub-histogram into `out` with atomics.
	//
	// A pixel is truncated to int and counted in bin (value mod OUT_SIZE), folded
	// into [0, OUT_SIZE).
	//
	// Launch layout : 1D grid of 1D blocks, any sizes (pixels are visited with a
	//                 stride of gridDim.x * blockDim.x).
	// Shared memory : OUT_SIZE unsigned ints, statically allocated.
	//
	//   img      - height*width pixel values readable from the device
	//   height   - image height in pixels
	//   width    - image width in pixels
	//   out      - global histogram with at least OUT_SIZE bins; counts are ADDED
	//              to its current contents, so the caller zeroes it beforehand
	//   out_size - not read by the kernel; the bin count is fixed at OUT_SIZE
	__global__
	void histo_d(float* img, int height, int width, unsigned int *out, int out_size){
	    // Index in 64 bits: height*width and blockIdx.x*blockDim.x can both
	    // exceed what an int holds. Non-positive dimensions mean "no pixels".
	    const size_t num_pixels = (height > 0 && width > 0)
	        ? static_cast<size_t>(height) * static_cast<size_t>(width)
	        : 0;
	    const size_t first  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
	    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
	    __shared__ unsigned int shmem[OUT_SIZE];

	    // Step by the real block size rather than the BLOCK_SIZE macro so every
	    // bin is cleared even when launched with fewer than BLOCK_SIZE threads.
	    for (unsigned int i = threadIdx.x; i < OUT_SIZE; i += blockDim.x){
	        shmem[i] = 0;
	    }
	    // All bins must be zero before any thread starts counting.
	    __syncthreads();

	    for (size_t i = first; i < num_pixels; i += stride){
	        int bucket = static_cast<int>(img[i]) % OUT_SIZE;
	        // % keeps the sign of the dividend; fold negative remainders back
	        // into range instead of indexing before the start of shmem.
	        if (bucket < 0){
	            bucket += OUT_SIZE;
	        }
	        atomicAdd(&shmem[bucket], 1u);
	    }
	    // All counting must finish before the block's totals are published.
	    __syncthreads();

	    for (unsigned int i = threadIdx.x; i < OUT_SIZE; i += blockDim.x){
	        const unsigned int count = shmem[i];
	        // An empty bin adds nothing; skip the global atomic for it.
	        if (count != 0){
	            atomicAdd(&out[i], count);
	        }
	    }

	}

	// CPU reference histogram: truncates each pixel to int and increments bin
	// (value mod OUT_SIZE), folded into [0, OUT_SIZE).
	//
	//   img      - height*width pixel values
	//   height   - image height in pixels
	//   width    - image width in pixels
	//   out      - histogram with at least OUT_SIZE bins; counts are added to its
	//              current contents, so the caller zeroes it beforehand
	//   out_size - not read; the bin count is fixed at OUT_SIZE
	void histo_h(float* img, int height, int width, unsigned int *out, int out_size){
	    // Compute the pixel count in size_t so large images cannot overflow int;
	    // non-positive dimensions mean "no pixels".
	    const size_t num_pixels = (height > 0 && width > 0)
	        ? static_cast<size_t>(height) * static_cast<size_t>(width)
	        : 0;
	    for (size_t i = 0; i < num_pixels; ++i){
	        int bucket = static_cast<int>(img[i]) % OUT_SIZE;
	        // % keeps the sign of the dividend; fold negative remainders back
	        // into range instead of writing before the start of out.
	        if (bucket < 0){
	            bucket += OUT_SIZE;
	        }
	        out[bucket]++;
	    }
	}

	// Driver: fills a 1024x1024 image with the values 0, 1, 2, ..., histograms it
	// with histo_d on the GPU and histo_h on the CPU, reports every bin where the
	// two disagree, and prints both timings.
	//
	// Returns EXIT_SUCCESS when all bins match and EXIT_FAILURE otherwise. A
	// failing CUDA runtime call terminates the process inside gpuErrchk.
	int main(){
	    float *img;
	    unsigned int *out;
	    unsigned int *out_h;
	    int out_size = OUT_SIZE;
	    int height = 1024;
	    int width = 1024;
	    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);

	    // Managed allocations: the same pointers are used on the host and in the kernel.
	    // (The size expressions here had lost their `*` operators and did not compile.)
	    gpuErrchk(cudaMallocManaged(&img, num_pixels*sizeof(float)));
	    gpuErrchk(cudaMallocManaged(&out, out_size*sizeof(unsigned int)));

	    out_h = (unsigned int*) malloc(out_size*sizeof(unsigned int));
	    if (out_h == NULL){
	        fprintf(stderr, "failed to allocate %d host histogram bins\n", out_size);
	        cudaFree(img);
	        cudaFree(out);
	        return EXIT_FAILURE;
	    }

	    for (size_t i=0; i < num_pixels; ++i){
	        img[i] = static_cast<float>(i);
	    }

	    // Both histograms accumulate into their output, so start from zero.
	    for (int i=0; i < out_size; ++i){
	        out[i] = 0;
	        out_h[i] = 0;
	    }

	    cudaEvent_t start, stop;
	    gpuErrchk(cudaEventCreate(&start));
	    gpuErrchk(cudaEventCreate(&stop));

	    // NOTE(review): img/out were last written by the host, so the measured
	    // interval may include managed-memory migration - confirm if pure kernel
	    // time is what should be reported.
	    gpuErrchk(cudaEventRecord(start));
	    histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
	    // A kernel launch returns no status; a bad launch is only visible here.
	    gpuErrchk(cudaGetLastError());
	    gpuErrchk(cudaEventRecord(stop));

	    gpuErrchk(cudaEventSynchronize(stop));
	    // Make sure the device is idle (and any execution fault is reported)
	    // before the host reads the managed buffers again.
	    gpuErrchk(cudaDeviceSynchronize());

	    float milliseconds = 0;
	    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));

	    std::chrono::high_resolution_clock::time_point ch_start;
	    std::chrono::high_resolution_clock::time_point ch_end ;
	    std::chrono::duration<double> span;
	    ch_start = std::chrono::high_resolution_clock::now();
	    histo_h(img, height, width, out_h, out_size);
	    ch_end = std::chrono::high_resolution_clock::now();
	    span = std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

	    int mismatches = 0;
	    for(int i=0; i < out_size; ++i){
	        if (out[i] != out_h[i]){
	            std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
	            ++mismatches;
	        }
	    }

	    printf("GPU Effective time: %f ms\n", milliseconds);
	    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

	    gpuErrchk(cudaEventDestroy(start));
	    gpuErrchk(cudaEventDestroy(stop));
	    gpuErrchk(cudaFree(img));
	    gpuErrchk(cudaFree(out));
	    free(out_h);

	    // Let scripts detect a wrong result from the exit status.
	    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
	}