Created
May 24, 2013 09:35
-
-
Save anonymous/5642406 to your computer and use it in GitHub Desktop.
Here's my CUDA code.
It just makes a copy of a matrix. Both a serial and parallel version are written so that I can compare them. It gives wrong answer if the array size is 64. For 256 and beyond, it gives correct answer. However it takes too long, 4 seconds for a 512x512 matrix. I am not comfortable with cudaMemcpy2D. Can someone please pinpoin…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<stdio.h> | |
#include<assert.h> | |
void verify(float * A, float * B, int size); | |
__global__ void CopyData(float *d_array, float* d_dest_array, size_t pitch, int cols, int rows) | |
{ | |
for(int i=0; i<rows; i++){ | |
float *rowData = (float*)(((char*)d_array) + (i*pitch)); | |
for(int j=0; j<cols; j++){ | |
d_dest_array[i*cols+j] = *(rowData+j); | |
} | |
} | |
} | |
int main(int argc, char **argv) | |
{ | |
int row, col, i, j; | |
float time1, time2; | |
float *d_array; // dev arr which mem will be alloc to | |
float *d_dest_array; // dev arr that will be a copy | |
size_t pitch; // ensures correct data struc alignm | |
if(argc != 3) | |
{ | |
printf("Usage: %s [row] [col]\n", argv[0]); | |
return 1; | |
} | |
row = atoi(argv[1]); | |
col = atoi(argv[2]); | |
float *h1_array = new float[col*row]; | |
float *h2_array = new float[col*row]; | |
float *h_ori_array = new float[col*row]; | |
for (i = 0; i<row; i++){ | |
for(j = 0; j<col; j++){ | |
h_ori_array[i*col+j] = i*col + j; | |
} | |
} | |
cudaEvent_t start, stop; | |
cudaMallocPitch(&d_array, &pitch, col*sizeof(float), row); | |
cudaMalloc(&d_dest_array, col*row*sizeof(float)); | |
cudaMemcpy2D(d_array, pitch, h_ori_array, col*sizeof(float), col*sizeof(float), row, cudaMemcpyHostToDevice); | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
cudaEventRecord(start, 0); | |
//CopyData<<<100, 512>>>(d_array, d_dest_array, pitch, col, row); | |
for (i = 0; i<row; i++){ | |
for(j = 0; j<col; j++){ | |
h1_array[i*col+j] = h_ori_array[i*col+j]; | |
} | |
} | |
cudaEventRecord(stop, 0); | |
cudaEventSynchronize(stop); | |
cudaEventElapsedTime(&time1, start, stop); | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
cudaEventRecord(start, 0); | |
CopyData<<<row*col/512, 512>>>(d_array, d_dest_array, pitch, col, row); | |
cudaEventRecord(stop, 0); | |
cudaEventSynchronize(stop); | |
cudaEventElapsedTime(&time2, start, stop); | |
cudaMemcpy2D(h2_array, pitch, d_dest_array, col*sizeof(float), col*sizeof(float), row, cudaMemcpyDeviceToHost); | |
verify(h1_array, h2_array, row*col); | |
free(h1_array); free(h2_array); free(h_ori_array); | |
cudaFree(d_array); cudaFree(d_dest_array); | |
printf("Exec time in ser = %f, par = %f ms with pitch %d", time1, time2, (int)pitch); | |
return 0; | |
} | |
void verify(float * A, float * B, int size) | |
{ | |
for (int i = 0; i < size; i++) | |
{ | |
assert(A[i]==B[i]); | |
} | |
printf("Correct!"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment