/aa.cu

## aa.cu
#include<assert.h>
#include<limits.h>
#include<stdio.h>
#include<stdlib.h>
// Forward declaration of the host-side checker defined after main():
// it asserts A[i]==B[i] for every i in [0, size) and then prints "Correct!".
void verify(float * A, float * B, int size);

/*
 * Copies a pitched 2D float array into a tightly packed row-major array.
 *
 *   d_array       source; row i starts at byte offset i*pitch from d_array
 *   d_dest_array  destination holding rows*cols floats, element (i, j) is
 *                 written to d_dest_array[i*cols + j]
 *   pitch         row stride of d_array in bytes
 *   cols, rows    logical matrix dimensions in elements
 *
 * Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop over
 * the flattened element index, so any grid with at least one thread produces
 * the full copy and each element is written by exactly one x-index. No shared
 * memory is used.
 */
__global__ void CopyData(float *d_array, float* d_dest_array, size_t pitch, int cols, int rows)
{
	// Nothing to copy; this also keeps the modulo below away from cols == 0.
	if (rows <= 0 || cols <= 0) {
		return;
	}

	// size_t arithmetic: rows*cols and blockIdx.x*blockDim.x may exceed 2^31.
	const size_t total  = (size_t)rows * (size_t)cols;
	const size_t stride = (size_t)gridDim.x * blockDim.x;

	for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride) {
		const size_t r = idx / (size_t)cols;
		const size_t c = idx % (size_t)cols;
		// Rows of the source are pitch bytes apart, so step in bytes, then
		// reinterpret the row start as floats.
		const float *rowData = (const float*)(((const char*)d_array) + r * pitch);
		d_dest_array[idx] = rowData[c];
	}
}

#ifndef CUDA_CHECK
/* Evaluates a CUDA runtime call; on failure prints the error and exits. */
#define CUDA_CHECK(call)                                                  \
	do {                                                                  \
		cudaError_t err_ = (call);                                        \
		if (err_ != cudaSuccess) {                                        \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(err_));                            \
			exit(EXIT_FAILURE);                                           \
		}                                                                 \
	} while (0)
#endif

/*
 * Usage: <prog> [row] [col]
 *
 * Fills a row x col float matrix on the host, uploads it into pitched device
 * memory, then copies it twice: once on the host with a plain loop and once on
 * the device with the CopyData kernel. Both copies are timed with CUDA events,
 * the results are compared with verify(), and the timings plus the device
 * pitch are printed.
 *
 * Returns 0 on success and 1 on bad arguments; any CUDA failure terminates the
 * process through CUDA_CHECK.
 */
int main(int argc, char **argv)
{
	if (argc != 3)
	{
		printf("Usage: %s [row] [col]\n", argv[0]);
		return 1;
	}

	const int row = atoi(argv[1]);
	const int col = atoi(argv[2]);
	if (row <= 0 || col <= 0)
	{
		fprintf(stderr, "row and col must be positive integers\n");
		return 1;
	}
	// verify() takes the element count as int, so row*col has to fit in one.
	if ((size_t)row * (size_t)col > (size_t)INT_MAX)
	{
		fprintf(stderr, "row*col is too large\n");
		return 1;
	}

	const size_t count    = (size_t)row * (size_t)col;   // elements in the matrix
	const size_t rowBytes = (size_t)col * sizeof(float); // bytes in one packed row

	float *h1_array    = new float[count]; // host-side copy (reference)
	float *h2_array    = new float[count]; // receives the device-side copy
	float *h_ori_array = new float[count]; // original data
	for (size_t k = 0; k < count; k++)
	{
		h_ori_array[k] = (float)k;
	}

	float *d_array = NULL;      // pitched device source
	float *d_dest_array = NULL; // packed device destination
	size_t pitch = 0;           // device row stride in bytes, set by cudaMallocPitch
	CUDA_CHECK(cudaMallocPitch(&d_array, &pitch, rowBytes, row));
	CUDA_CHECK(cudaMalloc(&d_dest_array, count * sizeof(float)));
	CUDA_CHECK(cudaMemcpy2D(d_array, pitch, h_ori_array, rowBytes, rowBytes, row, cudaMemcpyHostToDevice));

	// One pair of events is created once and reused for both measurements.
	cudaEvent_t start, stop;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	// Serial host copy, bracketed by the two event records.
	float time1 = 0.0f;
	CUDA_CHECK(cudaEventRecord(start, 0));
	for (size_t k = 0; k < count; k++)
	{
		h1_array[k] = h_ori_array[k];
	}
	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&time1, start, stop));

	// Device copy. Ceil-division so small inputs still get one block and the
	// tail of the matrix is covered.
	const unsigned int threads = 512;
	const unsigned int blocks  = (unsigned int)((count + threads - 1) / threads);
	float time2 = 0.0f;
	CUDA_CHECK(cudaEventRecord(start, 0));
	CopyData<<<blocks, threads>>>(d_array, d_dest_array, pitch, col, row);
	CUDA_CHECK(cudaGetLastError()); // launch-configuration errors surface here
	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&time2, start, stop));

	// Both d_dest_array and h2_array are tightly packed, so a flat copy is the
	// right transfer; the device pitch applies only to d_array.
	CUDA_CHECK(cudaMemcpy(h2_array, d_dest_array, count * sizeof(float), cudaMemcpyDeviceToHost));

	verify(h1_array, h2_array, row * col);

	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));
	// Buffers came from new[], so they must be released with delete[].
	delete[] h1_array;
	delete[] h2_array;
	delete[] h_ori_array;
	CUDA_CHECK(cudaFree(d_array));
	CUDA_CHECK(cudaFree(d_dest_array));
	printf("Exec time in ser = %f, par = %f ms with pitch %d", time1, time2, (int)pitch);

	return 0;
}

/*
 * Host-side check that two float buffers hold identical values.
 *
 *   A, B  buffers to compare
 *   size  number of leading elements to compare
 *
 * Each pair is compared with assert (exact equality); once all pairs have been
 * visited, "Correct!" is printed to stdout.
 */
void verify(float * A, float * B, int size)
{
	int i = 0;
	while (i < size)
	{
		assert(A[i]==B[i]);
		++i;
	}
	printf("Correct!");
}
	#include<stdio.h>
	#include<assert.h>
	// Forward declaration of the host-side checker defined after main():
	// it asserts A[i]==B[i] for every i in [0, size) and then prints "Correct!".
	void verify(float * A, float * B, int size);

	/*
	 * Copies a pitched 2D float array into a tightly packed row-major array.
	 *
	 *   d_array       source; row i starts at byte offset i*pitch from d_array
	 *   d_dest_array  destination holding rows*cols floats, element (i, j) is
	 *                 written to d_dest_array[i*cols + j]
	 *   pitch         row stride of d_array in bytes
	 *   cols, rows    logical matrix dimensions in elements
	 *
	 * Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop
	 * over the flattened element index, so any grid with at least one thread
	 * produces the full copy. No shared memory is used.
	 */
	__global__ void CopyData(float *d_array, float *d_dest_array, size_t pitch, int cols, int rows)
	{
		// Nothing to copy; this also keeps the modulo below away from cols == 0.
		if (rows <= 0 || cols <= 0) {
			return;
		}

		// size_t arithmetic: rows*cols and blockIdx.x*blockDim.x may exceed 2^31.
		const size_t total  = (size_t)rows * (size_t)cols;
		const size_t stride = (size_t)gridDim.x * blockDim.x;

		for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride) {
			const size_t r = idx / (size_t)cols;
			const size_t c = idx % (size_t)cols;
			// Rows of the source are pitch bytes apart, so step in bytes, then
			// reinterpret the row start as floats.
			const float *rowData = (const float*)(((const char*)d_array) + r * pitch);
			d_dest_array[idx] = rowData[c];
		}
	}

	#ifndef CUDA_CHECK
	/* Evaluates a CUDA runtime call; on failure prints the error and exits. */
	#define CUDA_CHECK(call)                                                  \
		do {                                                                  \
			cudaError_t err_ = (call);                                        \
			if (err_ != cudaSuccess) {                                        \
				fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
				        cudaGetErrorString(err_));                            \
				exit(EXIT_FAILURE);                                           \
			}                                                                 \
		} while (0)
	#endif

	/*
	 * Usage: <prog> [row] [col]
	 *
	 * Fills a row x col float matrix on the host, uploads it into pitched
	 * device memory, then copies it twice: once on the host with a plain loop
	 * and once on the device with the CopyData kernel. Both copies are timed
	 * with CUDA events, the results are compared with verify(), and the timings
	 * plus the device pitch are printed.
	 *
	 * Returns 0 on success and 1 on bad arguments; any CUDA failure terminates
	 * the process through CUDA_CHECK.
	 */
	int main(int argc, char **argv)
	{
		if (argc != 3)
		{
			printf("Usage: %s [row] [col]\n", argv[0]);
			return 1;
		}

		const int row = atoi(argv[1]);
		const int col = atoi(argv[2]);
		if (row <= 0 || col <= 0)
		{
			fprintf(stderr, "row and col must be positive integers\n");
			return 1;
		}
		// verify() takes the element count as int, so row*col has to fit in one.
		if ((size_t)row * (size_t)col > (size_t)INT_MAX)
		{
			fprintf(stderr, "row*col is too large\n");
			return 1;
		}

		const size_t count    = (size_t)row * (size_t)col;   // elements in the matrix
		const size_t rowBytes = (size_t)col * sizeof(float); // bytes in one packed row

		float *h1_array    = new float[count]; // host-side copy (reference)
		float *h2_array    = new float[count]; // receives the device-side copy
		float *h_ori_array = new float[count]; // original data
		for (size_t k = 0; k < count; k++)
		{
			h_ori_array[k] = (float)k;
		}

		float *d_array = NULL;      // pitched device source
		float *d_dest_array = NULL; // packed device destination
		size_t pitch = 0;           // device row stride in bytes, set by cudaMallocPitch
		CUDA_CHECK(cudaMallocPitch(&d_array, &pitch, rowBytes, row));
		CUDA_CHECK(cudaMalloc(&d_dest_array, count * sizeof(float)));
		CUDA_CHECK(cudaMemcpy2D(d_array, pitch, h_ori_array, rowBytes, rowBytes, row, cudaMemcpyHostToDevice));

		// One pair of events is created once and reused for both measurements.
		cudaEvent_t start, stop;
		CUDA_CHECK(cudaEventCreate(&start));
		CUDA_CHECK(cudaEventCreate(&stop));

		// Serial host copy, bracketed by the two event records.
		float time1 = 0.0f;
		CUDA_CHECK(cudaEventRecord(start, 0));
		for (size_t k = 0; k < count; k++)
		{
			h1_array[k] = h_ori_array[k];
		}
		CUDA_CHECK(cudaEventRecord(stop, 0));
		CUDA_CHECK(cudaEventSynchronize(stop));
		CUDA_CHECK(cudaEventElapsedTime(&time1, start, stop));

		// Device copy. Ceil-division so small inputs still get one block and
		// the tail of the matrix is covered.
		const unsigned int threads = 512;
		const unsigned int blocks  = (unsigned int)((count + threads - 1) / threads);
		float time2 = 0.0f;
		CUDA_CHECK(cudaEventRecord(start, 0));
		CopyData<<<blocks, threads>>>(d_array, d_dest_array, pitch, col, row);
		CUDA_CHECK(cudaGetLastError()); // launch-configuration errors surface here
		CUDA_CHECK(cudaEventRecord(stop, 0));
		CUDA_CHECK(cudaEventSynchronize(stop));
		CUDA_CHECK(cudaEventElapsedTime(&time2, start, stop));

		// Both d_dest_array and h2_array are tightly packed, so a flat copy is
		// the right transfer; the device pitch applies only to d_array.
		CUDA_CHECK(cudaMemcpy(h2_array, d_dest_array, count * sizeof(float), cudaMemcpyDeviceToHost));

		verify(h1_array, h2_array, row * col);

		CUDA_CHECK(cudaEventDestroy(start));
		CUDA_CHECK(cudaEventDestroy(stop));
		// Buffers came from new[], so they must be released with delete[].
		delete[] h1_array;
		delete[] h2_array;
		delete[] h_ori_array;
		CUDA_CHECK(cudaFree(d_array));
		CUDA_CHECK(cudaFree(d_dest_array));
		printf("Exec time in ser = %f, par = %f ms with pitch %d", time1, time2, (int)pitch);

		return 0;
	}

	/*
	 * Host-side check that two float buffers hold identical values.
	 *
	 *   A, B  buffers to compare
	 *   size  number of leading elements to compare
	 *
	 * Each pair is compared with assert (exact equality); once all pairs have
	 * been visited, "Correct!" is printed to stdout.
	 */
	void verify(float * A, float * B, int size)
	{
		int i = 0;
		while (i < size)
		{
			assert(A[i]==B[i]);
			++i;
		}
		printf("Correct!");
	}