jhjin/helloworld.cu

## helloworld.cu
#include <stdio.h>
#include <stdlib.h>


// CUDA: grid-stride loop header.
// Declares `idx` as an int that starts at the calling thread's global 1-D
// index and advances by the total number of launched threads
// (blockDim.x * gridDim.x) for as long as it stays below `count`.
// Because the loop variable is an int, `count` must be representable as int.
#define CUDA_KERNEL_LOOP(idx, count) \
   for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < (count); idx += gridDim.x * blockDim.x)

// Use 1024 threads per block, which requires cuda sm_2x or above
const int CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks needed so that N elements get one thread each,
// i.e. ceil(N / CUDA_NUM_THREADS).
//   N        number of elements to cover
//   returns  the block count, or 0 when N <= 0 (nothing to launch; callers
//            should skip the kernel launch in that case)
// The quotient is computed as (N - 1) / CUDA_NUM_THREADS + 1 rather than
// (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS so that the intermediate
// value cannot overflow int when N is close to INT_MAX.
inline int GET_BLOCKS(const int N) {
   if (N <= 0)
      return 0;
   return (N - 1) / CUDA_NUM_THREADS + 1;
}


// Element-wise kernel: stores x[k] + 10 into y[k] for every k in [0, n).
//   n  number of elements to process
//   x  input array, read only
//   y  output array, one element written per processed index
// Each thread begins at its global 1-D index and steps by the total number
// of launched threads, so any 1-D launch configuration covers the range.
__global__ void yourkernel(const int n, const float *x, float *y) {
   const unsigned int step = blockDim.x * gridDim.x;
   for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < n; k += step) {
      y[k] = x[k] + 10.0f;
   }
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
   if (status == cudaSuccess)
      return true;
   fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
   return false;
}

// Prints `data`, laid out as channel x height x width with width varying
// fastest, under the heading "==> title": one text line per row and a
// blank line after each channel.
static void printVolume(const char *title, const float *data,
                        long channel, long height, long width) {
   printf("==> %s\n", title);
   for (long c = 0; c < channel; c++) {
      for (long h = 0; h < height; h++) {
         const float *row = data + (c*height + h)*width;
         for (long w = 0; w < width; w++)
            printf("%5.2f\t", row[w]);
         printf("\n");
      }
      printf("\n");
   }
}

// Demo driver: fills a 2x3x4 host buffer with 0..23, runs yourkernel on the
// device to add 10 to every element, then prints the input and the result.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if any checked CUDA call or the
// kernel launch reports an error (the message goes to stderr).
int main(int argc, char *argv[]) {

   const long channel = 2;
   const long height = 3;
   const long width = 4;
   const long length = channel*height*width;
   const size_t bytes = length*sizeof(float);

   // assign initial values; output is value-initialized (all zeros) so the
   // host never holds indeterminate floats
   float *input = new float[length];
   float *output = new float[length]();
   for (long i = 0; i < length; i++)
      input[i] = (float)i;

   // allocate device buffers and upload the input; output_cuda needs no
   // upload because the kernel writes every element in [0, length)
   float *input_cuda = NULL;
   float *output_cuda = NULL;
   bool ok = cudaOk(cudaMalloc((void**)&input_cuda, bytes), "cudaMalloc(input_cuda)")
          && cudaOk(cudaMalloc((void**)&output_cuda, bytes), "cudaMalloc(output_cuda)")
          && cudaOk(cudaMemcpy(input_cuda, input, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy(host to device)");

   if (ok) {
      // launch cuda kernel; the kernel and GET_BLOCKS take int, so narrow once
      const int count = (int)length;
      yourkernel <<< GET_BLOCKS(count), CUDA_NUM_THREADS >>> (count, input_cuda, output_cuda);

      // a launch returns no status itself, so query the launch error right away
      ok = cudaOk(cudaGetLastError(), "yourkernel launch");
   }

   if (ok) {
      // memcpy device to host; this blocking copy also reports errors raised
      // while the kernel was executing
      ok = cudaOk(cudaMemcpy(output, output_cuda, bytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(device to host)");
   }

   if (ok) {
      printVolume("input", input, channel, height, width);
      printVolume("output", output, channel, height, width);
   }

   // free all; cudaFree on a null pointer is a no-op, so this is safe even
   // when one of the allocations above failed
   cudaFree(input_cuda);
   cudaFree(output_cuda);
   delete[] input;
   delete[] output;
   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
	#include <stdio.h>
	#include <stdlib.h>


	// CUDA: grid-stride loop header.
	// Declares `idx` as an int that starts at the calling thread's global 1-D
	// index and advances by the total number of launched threads
	// (blockDim.x * gridDim.x) for as long as it stays below `count`.
	// Because the loop variable is an int, `count` must be representable as int.
	#define CUDA_KERNEL_LOOP(idx, count) \
	   for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < (count); idx += gridDim.x * blockDim.x)

	// Use 1024 threads per block, which requires cuda sm_2x or above
	const int CUDA_NUM_THREADS = 1024;

	// CUDA: number of blocks needed so that N elements get one thread each,
	// i.e. ceil(N / CUDA_NUM_THREADS).
	//   N        number of elements to cover
	//   returns  the block count, or 0 when N <= 0 (nothing to launch; callers
	//            should skip the kernel launch in that case)
	// The quotient is computed as (N - 1) / CUDA_NUM_THREADS + 1 rather than
	// (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS so that the intermediate
	// value cannot overflow int when N is close to INT_MAX.
	inline int GET_BLOCKS(const int N) {
	   if (N <= 0)
	      return 0;
	   return (N - 1) / CUDA_NUM_THREADS + 1;
	}


	// Element-wise kernel: stores x[index] + 10 into y[index] for every index
	// in [0, n).
	//   n  number of elements to process
	//   x  input array, read only
	//   y  output array, one element written per processed index
	// x and y must be pointers: the body subscripts both of them.
	__global__ void yourkernel(const int n, const float *x, float *y) {
	   CUDA_KERNEL_LOOP(index, n) {
	      y[index] = x[index] + 10.0f;
	   }
	}


	// Reports a failed CUDA runtime call on stderr.
	// Returns true when `status` is cudaSuccess, false otherwise.
	static bool cudaOk(cudaError_t status, const char *what) {
	   if (status == cudaSuccess)
	      return true;
	   fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	   return false;
	}

	// Prints `data`, laid out as channel x height x width with width varying
	// fastest, under the heading "==> title": one text line per row and a
	// blank line after each channel.
	static void printVolume(const char *title, const float *data,
	                        long channel, long height, long width) {
	   printf("==> %s\n", title);
	   for (long c = 0; c < channel; c++) {
	      for (long h = 0; h < height; h++) {
	         const float *row = data + (c*height + h)*width;
	         for (long w = 0; w < width; w++)
	            printf("%5.2f\t", row[w]);
	         printf("\n");
	      }
	      printf("\n");
	   }
	}

	// Demo driver: fills a 2x3x4 host buffer with 0..23, runs yourkernel on the
	// device to add 10 to every element, then prints the input and the result.
	// Returns EXIT_SUCCESS, or EXIT_FAILURE if any checked CUDA call or the
	// kernel launch reports an error (the message goes to stderr).
	int main(int argc, char *argv[]) {

	   const long channel = 2;
	   const long height = 3;
	   const long width = 4;
	   const long length = channel*height*width;
	   const size_t bytes = length*sizeof(float);

	   // assign initial values; output is value-initialized (all zeros) so the
	   // host never holds indeterminate floats
	   float *input = new float[length];
	   float *output = new float[length]();
	   for (long i = 0; i < length; i++)
	      input[i] = (float)i;

	   // allocate device buffers and upload the input; output_cuda needs no
	   // upload because the kernel writes every element in [0, length)
	   float *input_cuda = NULL;
	   float *output_cuda = NULL;
	   bool ok = cudaOk(cudaMalloc((void**)&input_cuda, bytes), "cudaMalloc(input_cuda)")
	          && cudaOk(cudaMalloc((void**)&output_cuda, bytes), "cudaMalloc(output_cuda)")
	          && cudaOk(cudaMemcpy(input_cuda, input, bytes, cudaMemcpyHostToDevice),
	                    "cudaMemcpy(host to device)");

	   if (ok) {
	      // launch cuda kernel; the kernel and GET_BLOCKS take int, so narrow once
	      const int count = (int)length;
	      yourkernel <<< GET_BLOCKS(count), CUDA_NUM_THREADS >>> (count, input_cuda, output_cuda);

	      // a launch returns no status itself, so query the launch error right away
	      ok = cudaOk(cudaGetLastError(), "yourkernel launch");
	   }

	   if (ok) {
	      // memcpy device to host; this blocking copy also reports errors raised
	      // while the kernel was executing
	      ok = cudaOk(cudaMemcpy(output, output_cuda, bytes, cudaMemcpyDeviceToHost),
	                  "cudaMemcpy(device to host)");
	   }

	   if (ok) {
	      printVolume("input", input, channel, height, width);
	      printVolume("output", output, channel, height, width);
	   }

	   // free all; cudaFree on a null pointer is a no-op, so this is safe even
	   // when one of the allocations above failed
	   cudaFree(input_cuda);
	   cudaFree(output_cuda);
	   delete[] input;
	   delete[] output;
	   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
	}