Skip to content

Instantly share code, notes, and snippets.

@kekonn
Created November 6, 2009 16:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kekonn/228070 to your computer and use it in GitHub Desktop.
Save kekonn/228070 to your computer and use it in GitHub Desktop.
/********************************************************************
* sample.cu
* This is an example of a CUDA program.
*********************************************************************/
#define MATRIX_SIZE 200
#define BLOCKSIZE 200
#define BLOCKDIM 200
#define BENCH_LOOP 100
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
/************************************************************************/
/* Init CUDA */
/************************************************************************/
#if __DEVICE_EMULATION__
/* Under device emulation there is no real device to probe. */
bool InitCUDA(void){return true;}
#else
/*
 * InitCUDA - find and select the first CUDA-capable device.
 *
 * Returns true and calls cudaSetDevice() on success; prints a diagnostic
 * to stderr and returns false when no usable device is found.
 */
bool InitCUDA(void)
{
int count = 0;
int i = 0;
/* BUG FIX: the original ignored the return value of cudaGetDeviceCount,
 * so a runtime/driver failure was indistinguishable from "zero devices". */
if(cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}
/* Pick the first device with compute capability >= 1.x. */
for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA.\n");
return false;
}
cudaSetDevice(i);
printf("CUDA initialized.\n");
return true;
}
#endif
/************************************************************************/
/* Example */
/************************************************************************/
/*
 * HelloCUDA - synthetic benchmark kernel.
 *
 * Expected launch: <<<BLOCKDIM, BLOCKSIZE>>> with BLOCKSIZE == MATRIX_SIZE:
 * one block per output row, one thread per column.
 *   a : MATRIX_SIZE x MATRIX_SIZE input matrix (read)
 *   b : MATRIX_SIZE x MATRIX_SIZE output matrix (written)
 *   c : BLOCKSIZE-element input vector (read)
 * Shared memory: BLOCKSIZE floats (s_b), statically declared.
 */
__global__ static void HelloCUDA(float* a, float* b, float* c)
{
// Optimized version.
int idy = blockIdx.x;   // output row handled by this block
int idx = threadIdx.x;  // output column handled by this thread
__shared__ float s_b[BLOCKSIZE];
float r_a = a[idy * BLOCKSIZE + idx];       // element (idy, idx)
float r_a_shift = a[idx * BLOCKSIZE + idy]; // transposed element (idx, idy)
float r_c = c[idx];
// BUG FIX: the original read c[BLOCKSIZE - idx], which is c[BLOCKSIZE]
// (one past the end) for idx == 0. Index the reversed vector correctly.
float r_c_rev = c[BLOCKSIZE - 1 - idx];
// Seed the output value with the transposed-matrix element.
float r_b_this = r_a_shift;
// c is only indexed with idx so no shared-memory bank conflicts can arise
// between threads. The loop merely inflates execution time for benchmarking.
int loops;
for (loops=0; loops<50; loops++) {
// Single combined computation: average of the normal and transposed
// values, plus the contribution from the mini array c.
r_b_this += ((r_a_shift + r_a) / 2) + r_c;
// Scale the value down while it is still below 50.
if( r_b_this < 50 ){
r_b_this = ( r_b_this / 100 ) * ( r_c / 20 );
}
// Extra adjustment for even-numbered columns.
if(idx % 2 == 0){
r_b_this = r_b_this - 20 + r_c_rev;
}
}
s_b[idx] = r_b_this;
__syncthreads();
// The last thread of the row replaces its own output with the row average.
if(idx + 1 == BLOCKSIZE){
// BUG FIX: 'som' was declared but never initialized before 'som +=',
// and accumulated float data in an int. Accumulate in a zeroed float.
float som = 0.0f;
int i;
for (i=0; i<(MATRIX_SIZE); i++) {
som += s_b[i];
}
som = som / (BLOCKSIZE);
b[idy * MATRIX_SIZE + idx] = som;
}else{
// Store the per-thread result in global memory.
b[idy * MATRIX_SIZE + idx] = r_b_this;
}
}
/* Fill a MATRIX_SIZE x MATRIX_SIZE matrix (row-major) with uniform random
 * values in [0, 100] drawn from rand(). */
void fill_matrix( float matrix[MATRIX_SIZE * MATRIX_SIZE])
{
int row, col;
for (row = 0; row < MATRIX_SIZE; ++row) {
float *line = &matrix[row * MATRIX_SIZE];
for (col = 0; col < MATRIX_SIZE; ++col) {
line[col] = 100.0f * ((float)rand() / (float)RAND_MAX);
}
}
}
/* Fill a BLOCKSIZE-element vector with uniform random values in [0, 100]
 * drawn from rand(). */
void fill_mini_matrix( float matrix[BLOCKSIZE])
{
int i;
for (i = 0; i < BLOCKSIZE; ++i) {
matrix[i] = 100.0f * ((float)rand() / (float)RAND_MAX);
}
}
/************************************************************************/
/* HelloCUDA */
/************************************************************************/
/*
 * Entry point: initialize CUDA, fill the host inputs with random data,
 * run the HelloCUDA kernel BENCH_LOOP times, and report the average
 * per-launch time in milliseconds (cutil timer).
 */
int main(int argc, char* argv[])
{
if(!InitCUDA()) {
return 0;
}
float* device_matrix_in;
// NOTE(review): ~160 KB per host matrix lives on the stack here; fine for
// the default stack size, but heap allocation would be safer if sizes grow.
float host_matrix_in[MATRIX_SIZE * MATRIX_SIZE];
float* device_mini_matrix_in;
float host_mini_matrix_in[BLOCKSIZE];
float* device_matrix_out;
float host_matrix_out[MATRIX_SIZE * MATRIX_SIZE];
fill_matrix(host_matrix_in);
fill_mini_matrix(host_mini_matrix_in);
CUDA_SAFE_CALL( cudaMalloc((void**) &device_matrix_in, sizeof(float) * MATRIX_SIZE * MATRIX_SIZE));
CUDA_SAFE_CALL( cudaMalloc((void**) &device_mini_matrix_in, sizeof(float) * BLOCKSIZE));
CUDA_SAFE_CALL( cudaMalloc((void**) &device_matrix_out, sizeof(float) * MATRIX_SIZE * MATRIX_SIZE));
unsigned int timer = 0;
CUDA_SAFE_CALL(cudaMemcpy(device_matrix_in,host_matrix_in, (MATRIX_SIZE * MATRIX_SIZE * sizeof(float)), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(device_mini_matrix_in,host_mini_matrix_in, (BLOCKSIZE * sizeof(float)), cudaMemcpyHostToDevice));
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
// Launch the kernel BENCH_LOOP times: one block per row, one thread per column.
int x;
for (x=0; x<BENCH_LOOP; x++) {
HelloCUDA<<< BLOCKDIM , BLOCKSIZE , 0>>>(device_matrix_in, device_matrix_out, device_mini_matrix_in);
}
CUT_CHECK_ERROR("Kernel execution failed\n");
// Kernel launches are asynchronous; synchronize before stopping the timer.
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer( timer));
printf("Processing time: %f (ms)\n", (cutGetTimerValue( timer) / BENCH_LOOP ));
CUT_SAFE_CALL( cutDeleteTimer( timer));
CUDA_SAFE_CALL(cudaMemcpy(host_matrix_out,device_matrix_out , (MATRIX_SIZE * MATRIX_SIZE * sizeof(float)), cudaMemcpyDeviceToHost));
printf("Done \n");
CUDA_SAFE_CALL( cudaFree(device_matrix_in));
// BUG FIX: device_mini_matrix_in was allocated but never freed (leak).
CUDA_SAFE_CALL( cudaFree(device_mini_matrix_in));
CUDA_SAFE_CALL( cudaFree(device_matrix_out));
CUT_EXIT(argc, argv);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment