@sandeepkumar-skb
Created January 30, 2021
#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 32   // tile width: one thread per element of a 32x32 tile
#define NUM_REPS  100   // timed repetitions per kernel
inline void gpuAssert(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess){
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Naive transpose: the global read is coalesced, but the global write is
// strided by N. Indexing assumes a square matrix (M == N); for M != N the
// write index would need to be col*M + row.
__global__
void gpu_transpose_global(int* d_in, int* d_out, int M, int N){
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    d_out[col*N + row] = d_in[row*N + col];
}
// Tiled transpose through shared memory: both global accesses are coalesced,
// but the column read shmem[tx][ty] strides the tile by BLOCK_SIZE (32) ints,
// so a warp hits the same shared-memory bank 32 times (32-way bank conflict).
__global__
void gpu_transpose_shared(int* d_in, int* d_out, int M, int N){
    __shared__ int shmem[BLOCK_SIZE][BLOCK_SIZE];
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    shmem[ty][tx] = d_in[row*N + col];
    __syncthreads();
    // Write the transposed tile: swap the block indices but keep the thread
    // indices, so the global write stays coalesced.
    col = blockDim.x*blockIdx.y + threadIdx.x;
    row = blockDim.y*blockIdx.x + threadIdx.y;
    d_out[row*N + col] = shmem[tx][ty];
}
// Same tiled transpose, but the tile is padded by one column so consecutive
// rows of a shared-memory column fall in different banks, eliminating the
// bank conflicts of the kernel above.
__global__
void gpu_transpose_shared_no_bankconflict(int* d_in, int* d_out, int M, int N){
    __shared__ int shmem[BLOCK_SIZE][BLOCK_SIZE+1];
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    shmem[ty][tx] = d_in[row*N + col];
    __syncthreads();
    col = blockDim.x*blockIdx.y + threadIdx.x;
    row = blockDim.y*blockIdx.x + threadIdx.y;
    d_out[row*N + col] = shmem[tx][ty];
}
// CPU reference transpose (also assumes M == N).
void golden_model(int* input, int* output, int M, int N){
    for (int i = 0; i < M; ++i){
        for (int j = 0; j < N; ++j){
            output[N*j + i] = input[N*i + j];
        }
    }
}
// Compare against the CPU reference and report average time and bandwidth.
void postprocess(int *golden, int* gpu_out, int M, int N, float ms){
    for (int i = 0; i < M*N; ++i){
        if (gpu_out[i] != golden[i]){
            printf("Mismatch: gpu: %d, cpu: %d, idx: %d\n", gpu_out[i], golden[i], i);
            break;
        }
    }
    printf("Effective time: %.3f ms\n", ms/NUM_REPS);
    // Each element is read once and written once per transpose.
    float bytes = sizeof(int) * 2 * (float)M * (float)N;
    // bytes * 1e-6 / (ms/NUM_REPS) converts bytes per millisecond to GB/s.
    printf("Effective Bandwidth: %.3f GB/s\n", bytes * 1e-6 * NUM_REPS / ms);
}
int main(){
    int M = 1024;
    int N = 1024;
    int *d_in, *d_out, *shd_out, *h_out;
    gpuErrchk(cudaMallocManaged(&d_in,    M*N*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&d_out,   M*N*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&shd_out, M*N*sizeof(int)));
    gpuErrchk(cudaMallocManaged(&h_out,   M*N*sizeof(int)));

    // Initialize; h_out holds the CPU reference result.
    for (int i = 0; i < M*N; ++i){
        d_in[i]    = i;
        d_out[i]   = 0;
        shd_out[i] = 0;
        h_out[i]   = 0;
    }
    golden_model(d_in, h_out, M, N);

    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 gridSize((N-1)/BLOCK_SIZE + 1, (M-1)/BLOCK_SIZE + 1, 1);

    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    /************ TRANSPOSE USING GLOBAL MEM **************/
    // warmup
    gpu_transpose_global<<<gridSize, blockSize>>>(d_in, d_out, M, N);
    gpuErrchk(cudaEventRecord(start));
    for (int i = 0; i < NUM_REPS; ++i)
        gpu_transpose_global<<<gridSize, blockSize>>>(d_in, d_out, M, N);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Transpose using Global Memory\n");
    postprocess(h_out, d_out, M, N, milliseconds);
    printf("\n");
    /************ TRANSPOSE USING SHARED MEM **************/
    // warmup
    gpu_transpose_shared<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(start));
    for (int i = 0; i < NUM_REPS; ++i)
        gpu_transpose_shared<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));
    milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Transpose using Shared Memory\n");
    postprocess(h_out, shd_out, M, N, milliseconds);
    printf("\n");
    /************ TRANSPOSE USING SHARED MEM & NO BANK CONFLICT **************/
    // warmup
    gpu_transpose_shared_no_bankconflict<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(start));
    for (int i = 0; i < NUM_REPS; ++i)
        gpu_transpose_shared_no_bankconflict<<<gridSize, blockSize>>>(d_in, shd_out, M, N);
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));
    milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Transpose using Shared Memory and No bank conflict\n");
    postprocess(h_out, shd_out, M, N, milliseconds);
    printf("\n");
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(shd_out);
    cudaFree(h_out);
    return 0;
}
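Note that the kernels and the CPU reference all index with N on both sides, so they implicitly assume a square matrix whose side is a multiple of BLOCK_SIZE (which holds here, since 1024 = 32 * 32). As a sketch that is not part of the original gist, a bounds-checked naive kernel for a general M x N input could look like this:

// Sketch (assumption, not in the original gist): naive transpose for an
// arbitrary M x N input. The output is N x M, so the write index uses M.
__global__
void gpu_transpose_global_general(int* d_in, int* d_out, int M, int N){
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row < M && col < N)
        d_out[col*M + row] = d_in[row*N + col];
}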
sandeepkumar-skb commented Jan 30, 2021

Compile and run: nvcc -std=c++11 transpose.cu -o transpose.o && ./transpose.o
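To observe the bank-conflict difference directly, one option (assuming a pre-Turing GPU where nvprof is still supported; Nsight Compute is the newer equivalent) is to compare shared-memory load transactions per request, which should be close to 32 for the conflicted kernel and close to 1 for the padded one:

nvprof --metrics shared_load_transactions_per_request ./transpose.o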
Results

Transpose using Global Memory
Effective time: 0.054 ms
Effective Bandwidth: 155.923 GB/s

Transpose using Shared Memory
Effective time: 0.037 ms
Effective Bandwidth: 226.341 GB/s

Transpose using Shared Memory and No bank conflict
Effective time: 0.019 ms
Effective Bandwidth: 449.778 GB/s
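As a sanity check on these numbers (using the 0.054 ms global-memory figure above), each transpose reads and writes every element once, so:

bytes moved = 2 * 1024 * 1024 * 4 B ≈ 8.39 MB per transpose
bandwidth   = 8.39 MB / 0.054 ms ≈ 155 GB/s

which matches the reported figure. Note the formula in postprocess computes gigabytes per second, not gigabits, so the label is GB/s rather than Gbps.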
