Skip to content

Instantly share code, notes, and snippets.

@NicholasShatokhin
Created September 23, 2012 10:31
Show Gist options
  • Save NicholasShatokhin/3769635 to your computer and use it in GitHub Desktop.
Save NicholasShatokhin/3769635 to your computer and use it in GitHub Desktop.
CUDA random
#include <cuda.h>
#include <curand_kernel.h>
#include <stdio.h>
// CUDA_CALL(x): evaluate a CUDA runtime call exactly once; if it did not
// return cudaSuccess, print the file/line and the error string and return
// EXIT_FAILURE from the enclosing function.
//
// - Only usable inside functions that return int (it expands to a `return`).
// - The status is captured in a local so that `x` is never re-executed on the
//   error path. Re-running it would repeat side effects (e.g. a second
//   cudaMalloc/cudaFree) and, for cudaGetLastError(), would report the
//   already-cleared state instead of the real error.
#define CUDA_CALL(x) do { cudaError_t cudaCallStatus_ = (x); \
    if (cudaCallStatus_ != cudaSuccess) { \
        printf("Error at %s:%d -- %s\n", __FILE__, __LINE__, \
               cudaGetErrorString(cudaCallStatus_)); \
        return EXIT_FAILURE; } } while(0)

// Number of random values to generate.
#define N 10000
// Initializes one cuRAND generator state per thread.
//
// Launch layout: 1D grid of 1D blocks; each thread handles the element whose
// index equals its global thread id.
//
// state : device array with at least N curandState entries.
// seed  : seed shared by all threads; each thread uses its own index as the
//         cuRAND subsequence number (offset 0) so the per-thread streams differ.
//
// The grid usually does not divide N evenly, so threads whose index falls
// past the end of the N-element array exit without touching memory.
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // tail thread of the last block: nothing to initialize

    curand_init ( seed, idx, 0, &state[idx] );
}
// Draws one uniformly distributed float per thread and stores it in
// randomArray at the thread's global index.
//
// Launch layout: 1D grid of 1D blocks; each thread handles the element whose
// index equals its global thread id.
//
// globalState : device array with at least N generator states, previously
//               initialized (see setup_kernel). Each state is advanced and
//               written back so a later launch continues the sequence.
// randomArray : device output array with at least N floats.
//
// Threads whose index falls past the N-element arrays exit without reading
// or writing, because the grid usually does not divide N evenly.
__global__ void generate( curandState* globalState, float * randomArray )
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // tail thread of the last block: no slot to fill

    // Work on a local copy of the state, then store it back once.
    curandState localState = globalState[idx];
    randomArray[idx] = curand_uniform( &localState );
    globalState[idx] = localState;
}
// Prints a human-readable summary of a device's properties to stdout.
//
// devProp : properties of one device, as filled in by
//           cudaGetDeviceProperties().
//
// The memory-size, pitch and alignment fields are size_t, so they are printed
// with %zu; printing them with %u is undefined and truncates values on
// platforms where size_t is wider than unsigned int.
void printDevProp(cudaDeviceProp devProp)
{
    printf("Major revision number: %d\n", devProp.major);
    printf("Minor revision number: %d\n", devProp.minor);
    printf("Name: %s\n", devProp.name);
    printf("Total global memory: %zu\n", devProp.totalGlobalMem);
    printf("Total shared memory per block: %zu\n", devProp.sharedMemPerBlock);
    printf("Total registers per block: %d\n", devProp.regsPerBlock);
    printf("Warp size: %d\n", devProp.warpSize);
    printf("Maximum memory pitch: %zu\n", devProp.memPitch);
    printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);

    // Per-axis limits (x, y, z) for block and grid dimensions.
    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);
    for (int i = 0; i < 3; ++i)
        printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);

    printf("Clock rate: %d\n", devProp.clockRate);
    printf("Total constant memory: %zu\n", devProp.totalConstMem);
    printf("Texture alignment: %zu\n", devProp.textureAlignment);
    // NOTE(review): deviceOverlap is documented as deprecated in favor of
    // asyncEngineCount -- confirm against the toolkit version in use.
    printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
    printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
    printf("Kernel execution timeout: %s\n", (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
}
// Program entry point.
//
// 1. Lists every CUDA device and prints its properties.
// 2. Generates N uniform random floats on the current device with cuRAND
//    (one generator state per thread, seeded from the wall clock).
// 3. Copies the values back to the host and prints one per line.
//
// Returns 0 on success, EXIT_FAILURE if no device is usable or any CUDA call
// or kernel launch fails. On failure paths device allocations are left for
// process teardown to reclaim.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // Number of CUDA devices
    int devCount = 0;
    CUDA_CALL(cudaGetDeviceCount(&devCount));
    printf("CUDA Device Query...\n");
    printf("There are %d CUDA devices.\n", devCount);
    if (devCount <= 0)
    {
        // Without a device there is no block size to derive and nothing to launch on.
        printf("No CUDA devices available.\n");
        return EXIT_FAILURE;
    }

    // The kernels run on the current device, so the launch configuration must
    // come from that device's limits rather than from whichever device
    // happens to be enumerated last.
    int currentDevice = 0;
    CUDA_CALL(cudaGetDevice(&currentDevice));

    int dimension = 0;
    int maxThreadsPerBlock = 0;

    // Iterate through devices
    for (int i = 0; i < devCount; ++i)
    {
        // Get device properties
        printf("\nCUDA Device #%d\n", i);
        cudaDeviceProp devProp;
        CUDA_CALL(cudaGetDeviceProperties(&devProp, i));
        if (i == currentDevice)
        {
            dimension = devProp.maxThreadsDim[0];
            maxThreadsPerBlock = devProp.maxThreadsPerBlock;
        }
        printDevProp(devProp);
    }

    // Use half of the maximum block x-dimension, clamped to a valid range so
    // the block size is never zero and never exceeds the per-block limit.
    int threadsPerBlock = dimension / 2;
    if (maxThreadsPerBlock > 0 && threadsPerBlock > maxThreadsPerBlock)
        threadsPerBlock = maxThreadsPerBlock;
    if (threadsPerBlock < 1)
        threadsPerBlock = 1;

    // Ceil-division: the smallest number of blocks that covers N elements.
    int blocksCount = (N + threadsPerBlock - 1) / threadsPerBlock;
    dim3 threads = dim3(threadsPerBlock, 1);
    dim3 blocks = dim3(blocksCount, 1);

    // The kernels index their buffers by global thread id, and the grid is
    // rounded up to whole blocks. Size the device buffers for every launched
    // thread so the tail of the last block always has a valid slot.
    size_t launchedThreads = (size_t)blocksCount * (size_t)threadsPerBlock;

    curandState* devStates = NULL;
    float * devRandomValues = NULL;
    CUDA_CALL(cudaMalloc ( &devStates, launchedThreads*sizeof( curandState ) ));
    CUDA_CALL(cudaMalloc ( &devRandomValues, launchedThreads*sizeof( *devRandomValues ) ));

    // setup seeds
    setup_kernel <<<blocks, threads>>> ( devStates, (unsigned long)time(NULL) );
    // Read the launch status once into a variable: cudaGetLastError() clears
    // the error, so it must not be queried a second time for reporting.
    cudaError_t launchStatus = cudaGetLastError();
    printf("%s\n", cudaGetErrorString(launchStatus));
    CUDA_CALL(launchStatus);

    // generate random numbers
    generate <<<blocks, threads>>> ( devStates, devRandomValues );
    launchStatus = cudaGetLastError();
    printf("%s\n", cudaGetErrorString(launchStatus));
    CUDA_CALL(launchStatus);

    // Blocking copy: also waits for the kernels above and reports any error
    // raised while they executed. Only the first N values are requested.
    float * randomValues = new float[N];
    cudaError_t copyStatus = cudaMemcpy ( randomValues, devRandomValues, N*sizeof(*randomValues), cudaMemcpyDeviceToHost );
    if (copyStatus != cudaSuccess)
    {
        delete[] randomValues;
        CUDA_CALL(copyStatus);
    }

    for(int i=0;i<N;i++)
    {
        printf("%f\n", randomValues[i]);
    }

    // Allocated with new[], so it must be released with delete[]. Done before
    // the device frees so an early return from them cannot leak it.
    delete[] randomValues;

    CUDA_CALL(cudaFree(devRandomValues));
    CUDA_CALL(cudaFree(devStates));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment