@sandeepkumar-skb
Created January 30, 2021
#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 32   // tile width: one thread per element of a 32x32 tile
#define NUM_REPS  100   // timed repetitions per kernel
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess){
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Naive transpose: the global read is coalesced, but the global write is
// strided by N. Indexing assumes a square matrix (M == N); for M != N the
// write index would need to be col*M + row.
__global__
void gpu_transpose_global(int* d_in, int* d_out, int M, int N){
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    d_out[col*N + row] = d_in[row*N + col];
}
// Tiled transpose through shared memory: both global accesses are coalesced,
// but the column read shmem[tx][ty] strides the tile by BLOCK_SIZE (32) ints,
// so a warp hits the same shared-memory bank 32 times (32-way bank conflict).
__global__
void gpu_transpose_shared(int* d_in, int* d_out, int M, int N){
    __shared__ int shmem[BLOCK_SIZE][BLOCK_SIZE];
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    shmem[ty][tx] = d_in[row*N + col];
    __syncthreads();
    // Write the transposed tile: swap the block indices but keep the thread
    // indices, so the global write stays coalesced.
    col = blockDim.x*blockIdx.y + threadIdx.x;
    row = blockDim.y*blockIdx.x + threadIdx.y;
    d_out[row*N + col] = shmem[tx][ty];
}
// Same tiled transpose, but the tile is padded by one column so consecutive
// rows of a shared-memory column fall in different banks, eliminating the
// bank conflicts of the kernel above.
__global__
void gpu_transpose_shared_no_bankconflict(int* d_in, int* d_out, int M, int N){
    __shared__ int shmem[BLOCK_SIZE][BLOCK_SIZE+1];
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    shmem[ty][tx] = d_in[row*N + col];
    __syncthreads();
    col = blockDim.x*blockIdx.y + threadIdx.x;
    row = blockDim.y*blockIdx.x + threadIdx.y;
    d_out[row*N + col] = shmem[tx][ty];
}
// CPU reference transpose (also assumes M == N).
void golden_model(int* input, int* output, int M, int N){
    for (int i = 0; i < M; ++i){
        for (int j = 0; j < N; ++j){
            output[N*j + i] = input[N*i + j];
        }
    }
}
// Compare against the CPU reference and report average time and bandwidth.
void postprocess(int *golden, int* gpu_out, int M, int N, float ms){
    for (int i = 0; i < M*N; ++i){
        if (gpu_out[i] != golden[i]){
            printf("Mismatch: gpu: %d, cpu: %d, idx: %d\n", gpu_out[i], golden[i], i);
            break;
        }
    }
    printf("Effective time: %.3f ms\n", ms/NUM_REPS);
    // Each element is read once and written once per transpose.
    float bytes = sizeof(int) * 2 * (float)M * (float)N;
    // bytes * 1e-6 / (ms/NUM_REPS) converts bytes per millisecond to GB/s.
    printf("Effective Bandwidth: %.3f GB/s\n", bytes * 1e-6 * NUM_REPS / ms);
}
int main(){
    int M = 1024;
    int N = 1024;
    int *d_in, *d_out, *shd_out, *h_out;
    gpuErrchk(cudaMallocManaged(&d_in,    M*N*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&d_out,   M*N*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&shd_out, M*N*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&h_out,   M*N*sizeof(int)));

    // Initialize; h_out holds the CPU reference result.
    for (int i = 0; i < M*N; ++i){
        d_in[i]    = i;
        d_out[i]   = 0;
        shd_out[i] = 0;
        h_out[i]   = 0;
    }
    golden_model(d_in, h_out, M, N);

    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 gridSize((N-1)/BLOCK_SIZE + 1, (M-1)/BLOCK_SIZE + 1, 1);

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    /************ TRANSPOSE USING GLOBAL MEM **************/
    // warmup
    gpu_transpose_global<<<gridSize, blockSize>>>(d_in, d_out, M, N);
    gpuErrchk(cudaEventRecord(start));
    for (int i = 0; i < NUM_REPS; ++i)
        gpu_transpose_global<<<gridSize, blockSize>>>(d_in, d_out, M, N);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Transpose using Global Memory\n");
    postprocess(h_out, d_out, M, N, milliseconds);
    printf("\n");
    /************ TRANSPOSE USING SHARED MEM **************/
    // warmup
    gpu_transpose_shared<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(start));
    for (int i = 0; i < NUM_REPS; ++i)
        gpu_transpose_shared<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));
    milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Transpose using Shared Memory\n");
    postprocess(h_out, shd_out, M, N, milliseconds);
    printf("\n");
    /************ TRANSPOSE USING SHARED MEM & NO BANK CONFLICT **************/
    // warmup
    gpu_transpose_shared_no_bankconflict<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(start));
    for (int i = 0; i < NUM_REPS; ++i)
        gpu_transpose_shared_no_bankconflict<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));
    milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Transpose using Shared Memory and No bank conflict\n");
    postprocess(h_out, shd_out, M, N, milliseconds);
    printf("\n");
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(shd_out);
    cudaFree(h_out);
    return 0;
}
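Note that the kernels and the CPU reference all index with N on both sides, so they implicitly assume a square matrix whose side is a multiple of BLOCK_SIZE (which holds here, since 1024 = 32 * 32). As a sketch that is not part of the original gist, a bounds-checked naive kernel for a general M x N input could look like this:

// Sketch (assumption, not in the original gist): naive transpose for an
// arbitrary M x N input. The output is N x M, so the write index uses M.
__global__
void gpu_transpose_global_general(int* d_in, int* d_out, int M, int N){
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row < M && col < N)
        d_out[col*M + row] = d_in[row*N + col];
}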
sandeepkumar-skb commented Jan 30, 2021

Compile and run: nvcc -std=c++11 transpose.cu -o transpose.o && ./transpose.o
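To observe the bank-conflict difference directly, one option (assuming a pre-Turing GPU where nvprof is still supported; Nsight Compute is the newer equivalent) is to compare shared-memory load transactions per request, which should be close to 32 for the conflicted kernel and close to 1 for the padded one:

nvprof --metrics shared_load_transactions_per_request ./transpose.o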
Results

Transpose using Global Memory
Effective time: 0.054 ms
Effective Bandwidth: 155.923 GB/s

Transpose using Shared Memory
Effective time: 0.037 ms
Effective Bandwidth: 226.341 GB/s

Transpose using Shared Memory and No bank conflict
Effective time: 0.019 ms
Effective Bandwidth: 449.778 GB/s
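As a sanity check on these numbers (using the 0.054 ms global-memory figure above), each transpose reads and writes every element once, so:

bytes moved = 2 * 1024 * 1024 * 4 B ≈ 8.39 MB per transpose
bandwidth   = 8.39 MB / 0.054 ms ≈ 155 GB/s

which matches the reported figure. Note the formula in postprocess computes gigabytes per second, not gigabits, so the label is GB/s rather than Gbps.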
