anddev68/gist:6000cc0a5af1ea2f1fc5

## gistfile1.txt
/*
	CUDA サンプルプログラム1
	http://cudasample.net/sample/1/003.html

	タイマーについてはこの記事から
	http://tnishinaga.hatenablog.com/entry/2014/08/10/215445
*/


#include <stdio.h>

/* CUDAのインテリセンスを有効にする */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_timer.h>

/* Work around IntelliSense errors: declare __syncthreads() for the editor's parser only */
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
#endif

/* Constants */
typedef unsigned int uint;
#define N (512*2014)	/* number of int elements allocated and processed */

/* Macros */
/*
	Prints the CUDA error string for `err` to stderr when `err` is not cudaSuccess.
	The argument is evaluated exactly once, and the do/while(0) wrapper makes the
	macro behave like a single statement (safe inside an unbraced if/else).
	It only reports the error; it does not abort or return.
*/
#define PRINT_ERR(err) \
	do { \
		cudaError_t print_err_status_ = (err); \
		if (print_err_status_ != cudaSuccess) { \
			fprintf(stderr, "%s\n", cudaGetErrorString(print_err_status_)); \
		} \
	} while (0)


/* Build switches */
//	Leave this defined to run on the CPU; comment it out to run the CUDA kernel
#define EXEC_CPU
//	Uncomment to measure time with CUDA events instead of the SDK stopwatch timer
//	#define MEASURE_WITH_EVENT


/*
	Adds 1 to data[index] in parallel, one element per thread.

	The element index is blockIdx.x*blockDim.x + threadIdx.x, so the kernel
	expects a 1D grid of 1D blocks. `data` must point to at least N ints that
	are accessible from the device. Threads whose index falls outside [0, N)
	do nothing, so a launch with more threads than elements cannot write
	past the end of the buffer.
*/
__global__ void kernel(int* data){
	int index = blockIdx.x*blockDim.x + threadIdx.x;
	if (index < N){
		data[index] += 1;
	}
}

/*
	CPU reference: adds 1 to each of the first n elements of data.
	Nothing is touched when n is zero or negative.
*/
void cpu(int *data,int n){
	int *cursor = data;
	int remaining;

	//	Count down the elements still to process while walking the array.
	for (remaining = n; remaining > 0; --remaining){
		*cursor += 1;
		++cursor;
	}
}


/*
	Entry point: fills an N-element int array with 0..N-1, copies it to the
	device, adds 1 to every element on the CPU (EXEC_CPU defined) or with the
	CUDA kernel (EXEC_CPU not defined), and prints the elapsed time in ms.

	Timing uses the SDK stopwatch timer by default, or CUDA events when
	MEASURE_WITH_EVENT is defined.

	Returns 0 normally, or 1 when the host or device buffer cannot be allocated.
	Other CUDA errors are reported through PRINT_ERR and execution continues.
*/
int main(void){

	int *host_data;
	int *device_data = NULL;
	int i;
	cudaError_t err;
	float time = 0.0f;
#ifndef MEASURE_WITH_EVENT
	StopWatchInterface *timer = NULL;	//	measure with the SDK timer
#else
	cudaEvent_t ev[2];	//	measure with events: ev[0] = start, ev[1] = stop
#endif

	/* Initialise the host data: host_data[i] = i */
	host_data = (int *)malloc(sizeof(int)*N);
	if (host_data == NULL){
		fprintf(stderr, "failed to allocate host memory\n");
		return 1;
	}
	for (i = 0; i < N; i++){
		host_data[i] = i;
	}

	/* Allocate device memory */
	err = cudaMalloc((void **)&device_data, sizeof(int)*N);
	PRINT_ERR(err);
	if (err != cudaSuccess){
		//	Nothing useful can be done without the device buffer.
		free(host_data);
		return 1;
	}

	/* Host -> device copy */
	err = cudaMemcpy(device_data, host_data, sizeof(int) * N, cudaMemcpyHostToDevice);
	PRINT_ERR(err);

	/* Thread and block configuration (N is a multiple of 512, so N/512 blocks cover every element) */
	dim3 blocks(N/512, 1,1);	//	one block per 512 elements
	dim3 threads(512, 1,1);	//	512 threads per block

	/* Start the time measurement */
#ifndef MEASURE_WITH_EVENT
	//	Using the SDK timer
	sdkCreateTimer(&timer);
	sdkResetTimer(&timer);
	sdkStartTimer(&timer);
#else
	//	Using events
	for (i = 0; i < 2; i++){
		err = cudaEventCreate(&ev[i]);
		PRINT_ERR(err);
	}
	err = cudaEventRecord(ev[0]);
	PRINT_ERR(err);
#endif


	/* Run the computation */
#ifdef EXEC_CPU
	//	Run on the CPU
	cpu(host_data, N);
#else
	//	Run the kernel
	kernel<<<blocks, threads>>>(device_data);
	//	A launch does not return an error code; fetch launch errors explicitly.
	err = cudaGetLastError();
	PRINT_ERR(err);
#ifndef MEASURE_WITH_EVENT
	//	Kernel launches are asynchronous: wait for completion so the host
	//	timer measures the kernel itself and not just the launch overhead.
	err = cudaDeviceSynchronize();
	PRINT_ERR(err);
#endif
#endif

	/* Stop the time measurement */
#ifndef MEASURE_WITH_EVENT
	//	Using the SDK timer
	sdkStopTimer(&timer);
	time = sdkGetTimerValue(&timer);
	sdkDeleteTimer(&timer);
#else
	//	Record the stop event and wait until it has completed
	err = cudaEventRecord(ev[1]);
	PRINT_ERR(err);
	err = cudaEventSynchronize(ev[1]);
	PRINT_ERR(err);
	//	Compute the elapsed time
	err = cudaEventElapsedTime(&time, ev[0], ev[1]);
	PRINT_ERR(err);
	//	Release the events
	for (i = 0; i < 2; i++){
		err = cudaEventDestroy(ev[i]);
		PRINT_ERR(err);
	}
#endif

	/* Print the elapsed time */
	printf("time:%f[ms]", time);

#ifndef EXEC_CPU
	/* Device -> host copy (GPU run only; in a CPU run this would overwrite the CPU result) */
	err = cudaMemcpy(host_data, device_data, sizeof(int)*N, cudaMemcpyDeviceToHost);
	PRINT_ERR(err);
#endif

	/* Optional debug output of every 1024th element */
	//for (i = 0; i < N; i++){
	//	if (i % 1024 == 0)
	//		printf("data[%8d]=%d\n",i,host_data[i]);
	//}

	/* Release memory */
	free(host_data);
	err = cudaFree(device_data);
	PRINT_ERR(err);


	return 0;
}
	/*
	CUDA サンプルプログラム1
	http://cudasample.net/sample/1/003.html

	タイマーについてはこの記事から
	http://tnishinaga.hatenablog.com/entry/2014/08/10/215445
	*/



	#include <stdio.h>

	/* CUDAのインテリセンスを有効にする */
	#include "cuda_runtime.h"
	#include "device_launch_parameters.h"
	#include <cuda.h>
	#include <device_functions.h>
	#include <cuda_runtime_api.h>

	#include <helper_functions.h>
	#include <helper_cuda.h>
	#include <helper_timer.h>

	/* Work around IntelliSense errors: declare __syncthreads() for the editor's parser only */
	#pragma once
	#ifdef __INTELLISENSE__
	void __syncthreads();
	#endif

	/* Constants */
	typedef unsigned int uint;
	#define N (512*2014)	/* number of int elements allocated and processed */

	/* Macros */
	/*
		Prints the CUDA error string for `err` to stderr when `err` is not cudaSuccess.
		The argument is evaluated exactly once, and the do/while(0) wrapper makes the
		macro behave like a single statement (safe inside an unbraced if/else).
		It only reports the error; it does not abort or return.
	*/
	#define PRINT_ERR(err) \
		do { \
			cudaError_t print_err_status_ = (err); \
			if (print_err_status_ != cudaSuccess) { \
				fprintf(stderr, "%s\n", cudaGetErrorString(print_err_status_)); \
			} \
		} while (0)


	/* Build switches */
	// Leave this defined to run on the CPU; comment it out to run the CUDA kernel
	#define EXEC_CPU
	// Uncomment to measure time with CUDA events instead of the SDK stopwatch timer
	// #define MEASURE_WITH_EVENT


	/*
		Adds 1 to data[index] in parallel, one element per thread.

		The element index is blockIdx.x*blockDim.x + threadIdx.x, so the kernel
		expects a 1D grid of 1D blocks. `data` must point to at least N ints that
		are accessible from the device. Threads whose index falls outside [0, N)
		do nothing, so a launch with more threads than elements cannot write
		past the end of the buffer.
	*/
	__global__ void kernel(int* data){
		int index = blockIdx.x*blockDim.x + threadIdx.x;
		if (index < N){
			data[index] += 1;
		}
	}

	/*
		CPU reference: adds 1 to each of the first n elements of data.
		Nothing is touched when n is zero or negative.
	*/
	void cpu(int *data,int n){
		int *cursor = data;
		int remaining;

		// Count down the elements still to process while walking the array.
		for (remaining = n; remaining > 0; --remaining){
			*cursor += 1;
			++cursor;
		}
	}





	/*
		Entry point: fills an N-element int array with 0..N-1, copies it to the
		device, adds 1 to every element on the CPU (EXEC_CPU defined) or with the
		CUDA kernel (EXEC_CPU not defined), and prints the elapsed time in ms.

		Timing uses the SDK stopwatch timer by default, or CUDA events when
		MEASURE_WITH_EVENT is defined.

		Returns 0 normally, or 1 when the host or device buffer cannot be allocated.
		Other CUDA errors are reported through PRINT_ERR and execution continues.
	*/
	int main(void){

		int *host_data;
		int *device_data = NULL;
		int i;
		cudaError_t err;
		float time = 0.0f;
	#ifndef MEASURE_WITH_EVENT
		StopWatchInterface *timer = NULL; // measure with the SDK timer
	#else
		cudaEvent_t ev[2]; // measure with events: ev[0] = start, ev[1] = stop
	#endif

		/* Initialise the host data: host_data[i] = i */
		host_data = (int *)malloc(sizeof(int)*N);
		if (host_data == NULL){
			fprintf(stderr, "failed to allocate host memory\n");
			return 1;
		}
		for (i = 0; i < N; i++){
			host_data[i] = i;
		}

		/* Allocate device memory */
		err = cudaMalloc((void **)&device_data, sizeof(int)*N);
		PRINT_ERR(err);
		if (err != cudaSuccess){
			// Nothing useful can be done without the device buffer.
			free(host_data);
			return 1;
		}

		/* Host -> device copy */
		err = cudaMemcpy(device_data, host_data, sizeof(int) * N, cudaMemcpyHostToDevice);
		PRINT_ERR(err);

		/* Thread and block configuration (N is a multiple of 512, so N/512 blocks cover every element) */
		dim3 blocks(N/512, 1,1); // one block per 512 elements
		dim3 threads(512, 1,1); // 512 threads per block

		/* Start the time measurement */
	#ifndef MEASURE_WITH_EVENT
		// Using the SDK timer
		sdkCreateTimer(&timer);
		sdkResetTimer(&timer);
		sdkStartTimer(&timer);
	#else
		// Using events
		for (i = 0; i < 2; i++){
			err = cudaEventCreate(&ev[i]);
			PRINT_ERR(err);
		}
		err = cudaEventRecord(ev[0]);
		PRINT_ERR(err);
	#endif


		/* Run the computation */
	#ifdef EXEC_CPU
		// Run on the CPU
		cpu(host_data, N);
	#else
		// Run the kernel
		kernel<<<blocks, threads>>>(device_data);
		// A launch does not return an error code; fetch launch errors explicitly.
		err = cudaGetLastError();
		PRINT_ERR(err);
	#ifndef MEASURE_WITH_EVENT
		// Kernel launches are asynchronous: wait for completion so the host
		// timer measures the kernel itself and not just the launch overhead.
		err = cudaDeviceSynchronize();
		PRINT_ERR(err);
	#endif
	#endif

		/* Stop the time measurement */
	#ifndef MEASURE_WITH_EVENT
		// Using the SDK timer
		sdkStopTimer(&timer);
		time = sdkGetTimerValue(&timer);
		sdkDeleteTimer(&timer);
	#else
		// Record the stop event and wait until it has completed
		err = cudaEventRecord(ev[1]);
		PRINT_ERR(err);
		err = cudaEventSynchronize(ev[1]);
		PRINT_ERR(err);
		// Compute the elapsed time
		err = cudaEventElapsedTime(&time, ev[0], ev[1]);
		PRINT_ERR(err);
		// Release the events
		for (i = 0; i < 2; i++){
			err = cudaEventDestroy(ev[i]);
			PRINT_ERR(err);
		}
	#endif

		/* Print the elapsed time */
		printf("time:%f[ms]", time);

	#ifndef EXEC_CPU
		/* Device -> host copy (GPU run only; in a CPU run this would overwrite the CPU result) */
		err = cudaMemcpy(host_data, device_data, sizeof(int)*N, cudaMemcpyDeviceToHost);
		PRINT_ERR(err);
	#endif

		/* Optional debug output of every 1024th element */
		//for (i = 0; i < N; i++){
		// if (i % 1024 == 0)
		// printf("data[%8d]=%d\n",i,host_data[i]);
		//}

		/* Release memory */
		free(host_data);
		err = cudaFree(device_data);
		PRINT_ERR(err);


		return 0;
	}