Created
November 6, 2009 16:13
-
-
Save kekonn/228070 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/******************************************************************** | |
* sample.cu | |
* This is an example of a CUDA program.
*********************************************************************/ | |
#define MATRIX_SIZE 200 | |
#define BLOCKSIZE 200 | |
#define BLOCKDIM 200 | |
#define BENCH_LOOP 100 | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <cuda_runtime.h> | |
#include <cutil.h> | |
/************************************************************************/ | |
/* Init CUDA */ | |
/************************************************************************/ | |
#if __DEVICE_EMULATION__
// Device-emulation build: no physical GPU is needed.
bool InitCUDA(void){return true;}
#else
/*
 * InitCUDA
 * Selects the first CUDA device with compute capability >= 1.0 and makes
 * it the current device.
 * Returns true on success; false (with a message on stderr) when no
 * usable device exists or a runtime call fails.
 */
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;

    // FIX: the original ignored the return value of cudaGetDeviceCount;
    // on a machine without a working driver the call itself fails.
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // Pick the first device reporting compute capability >= 1.x.
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    // FIX: check cudaSetDevice instead of assuming it succeeds.
    if (cudaSetDevice(i) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed.\n");
        return false;
    }
    printf("CUDA initialized.\n");
    return true;
}
#endif
/************************************************************************/ | |
/* Example */ | |
/************************************************************************/ | |
/*
 * HelloCUDA — benchmark kernel.
 *
 * Expected launch layout: one block per output row (blockIdx.x = row),
 * BLOCKSIZE threads per block (threadIdx.x = column).  Requires
 * blockDim.x == BLOCKSIZE == MATRIX_SIZE.
 *
 * a : MATRIX_SIZE x MATRIX_SIZE input matrix (read only)
 * b : MATRIX_SIZE x MATRIX_SIZE output matrix (written)
 * c : vector of BLOCKSIZE values (read only)
 */
__global__ static void HelloCUDA(float* a, float* b, float* c)
{
    // Optimized version
    int idy = blockIdx.x;   // row handled by this block
    int idx = threadIdx.x;  // column handled by this thread

    __shared__ float s_b[BLOCKSIZE];

    // Cache per-thread values in registers.
    float r_a       = a[idy * BLOCKSIZE + idx];  // a[row][col]
    float r_a_shift = a[idx * BLOCKSIZE + idy];  // a[col][row] (transposed element)
    float r_c       = c[idx];
    // BUG FIX: the original read c[BLOCKSIZE - idx], which for idx == 0
    // accesses c[BLOCKSIZE] — one element past the end of the array.
    // Use the proper reversed index instead.
    float r_c_rev   = c[BLOCKSIZE - 1 - idx];

    // Initialise the running value with the transposed ("shifted") element.
    float r_b_this = r_a_shift;

    // c is only ever indexed with idx, so no bank conflicts can arise
    // between threads.  The loop inflates total execution time so the
    // benchmark has something measurable.
    int loops;
    for (loops = 0; loops < 50; loops++) {
        // Single combined expression: half the shifted value, half the
        // normal value, plus the contribution from the small array c.
        r_b_this += ((r_a_shift + r_a) / 2) + r_c;
        // If the value dropped below 50, scale it down further.
        // (NOTE: the original comment claimed "greater than 50", but the
        // code has always tested "< 50"; the code is kept as-is.)
        if (r_b_this < 50) {
            r_b_this = (r_b_this / 100) * (r_c / 20);
        }
        // Even columns get an extra correction term.
        if (idx % 2 == 0) {
            r_b_this = r_b_this - 20 + r_c_rev;
        }
    }

    // Publish this thread's result so the last thread can average the row.
    s_b[idx] = r_b_this;
    __syncthreads();

    if (idx + 1 == BLOCKSIZE) {
        // Last thread of the row: store the row average in the last column.
        // BUG FIX: `som` was an uninitialised int in the original —
        // undefined behaviour, and each addition truncated the float sum.
        // Accumulate in a zero-initialised float instead.
        float som = 0.0f;
        for (int i = 0; i < MATRIX_SIZE; i++) {
            som += s_b[i];
        }
        som = som / BLOCKSIZE;
        b[idy * MATRIX_SIZE + idx] = som;
    } else {
        // Every other thread writes its own result to global memory.
        b[idy * MATRIX_SIZE + idx] = r_b_this;
    }
}
/* Fill a MATRIX_SIZE x MATRIX_SIZE matrix with uniform random values
 * in [0, 100], using the host rand() generator. */
void fill_matrix( float matrix[MATRIX_SIZE * MATRIX_SIZE])
{
    // Single flat loop over all cells; visits them in the same row-major
    // order as the original nested loops, so the rand() sequence lands
    // on identical positions.
    int cell;
    for (cell = 0; cell < MATRIX_SIZE * MATRIX_SIZE; ++cell) {
        matrix[cell] = ((float)rand() / (float)RAND_MAX) * 100.0f;
    }
}
/* Fill a vector of BLOCKSIZE entries with uniform random values in
 * [0, 100], using the host rand() generator. */
void fill_mini_matrix( float matrix[BLOCKSIZE])
{
    int i = 0;
    while (i < BLOCKSIZE) {
        matrix[i] = ((float)rand() / (float)RAND_MAX) * 100.0f;
        ++i;
    }
}
/************************************************************************/ | |
/* HelloCUDA */ | |
/************************************************************************/ | |
/*
 * Host driver: allocates the matrices, uploads them to the device,
 * launches HelloCUDA BENCH_LOOP times, reports the mean kernel time
 * via the cutil timer, and copies the result back to the host.
 */
int main(int argc, char* argv[])
{
    if (!InitCUDA()) {
        return 0;
    }

    float* device_matrix_in;
    float  host_matrix_in[MATRIX_SIZE * MATRIX_SIZE];
    float* device_mini_matrix_in;
    float  host_mini_matrix_in[BLOCKSIZE];
    float* device_matrix_out;
    float  host_matrix_out[MATRIX_SIZE * MATRIX_SIZE];

    fill_matrix(host_matrix_in);
    fill_mini_matrix(host_mini_matrix_in);

    CUDA_SAFE_CALL( cudaMalloc((void**) &device_matrix_in, sizeof(float) * MATRIX_SIZE * MATRIX_SIZE));
    CUDA_SAFE_CALL( cudaMalloc((void**) &device_mini_matrix_in, sizeof(float) * BLOCKSIZE));
    CUDA_SAFE_CALL( cudaMalloc((void**) &device_matrix_out, sizeof(float) * MATRIX_SIZE * MATRIX_SIZE));

    unsigned int timer = 0;
    CUDA_SAFE_CALL(cudaMemcpy(device_matrix_in, host_matrix_in, (MATRIX_SIZE * MATRIX_SIZE * sizeof(float)), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(device_mini_matrix_in, host_mini_matrix_in, (BLOCKSIZE * sizeof(float)), cudaMemcpyHostToDevice));

    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    // Launch the kernel BENCH_LOOP times; the reported time is the mean.
    int x;
    for (x = 0; x < BENCH_LOOP; x++) {
        HelloCUDA<<< BLOCKDIM , BLOCKSIZE , 0>>>(device_matrix_in, device_matrix_out, device_mini_matrix_in);
    }
    CUT_CHECK_ERROR("Kernel execution failed\n");
    // Kernel launches are asynchronous; wait for completion before
    // stopping the timer so the measurement covers the real work.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", (cutGetTimerValue( timer) / BENCH_LOOP ));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    CUDA_SAFE_CALL(cudaMemcpy(host_matrix_out, device_matrix_out, (MATRIX_SIZE * MATRIX_SIZE * sizeof(float)), cudaMemcpyDeviceToHost));
    printf("Done \n");

    CUDA_SAFE_CALL( cudaFree(device_matrix_in));
    // BUG FIX: the original never freed device_mini_matrix_in (leak).
    CUDA_SAFE_CALL( cudaFree(device_mini_matrix_in));
    CUDA_SAFE_CALL( cudaFree(device_matrix_out));

    CUT_EXIT(argc, argv);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment