Created
December 9, 2012 05:42
-
-
Save allanmac/4243479 to your computer and use it in GitHub Desktop.
A primitive example of threaded code in CUDA.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
// | |
// | |
// | |
//
// LAUNCH_BOUNDS DECORATES THE KERNEL. IT CURRENTLY EXPANDS TO NOTHING;
// THE DISABLED ALTERNATIVE IS __launch_bounds__(512).
//
#define LAUNCH_BOUNDS
//
// QUALIFIER PLACED IN FRONT OF EVERY SUBROUTINE DEFINITION.
//
#define DEVICE_FUNCTION_QUALIFIERS __device__
// | |
// | |
// | |
//
// SIGNATURE SHARED BY EVERY SUBROUTINE: TAKES THE DATA POINTER THE
// KERNEL WAS LAUNCHED WITH AND RETURNS NOTHING.
//
typedef void (*threadedFunc)(int* const);
// | |
// | |
// | |
//
// SUBROUTINE: TERMINATE THE CALLING THREAD BY ISSUING THE PTX "exit"
// INSTRUCTION. THE data ARGUMENT IS UNUSED; IT IS PRESENT ONLY SO THE
// FUNCTION MATCHES THE threadedFunc SIGNATURE.
//
DEVICE_FUNCTION_QUALIFIERS
void cudaExit(int* const data)
{
  // volatile: AN asm STATEMENT WITH NO OUTPUT OPERANDS IS OTHERWISE
  // ASSUMED TO HAVE NO SIDE EFFECTS AND MAY BE DELETED OR MOVED BY
  // THE COMPILER.
  asm volatile("exit;");
}
//
// SUBROUTINE: STORE 5 INTO THE ELEMENT OF data INDEXED BY threadIdx.x.
//
DEVICE_FUNCTION_QUALIFIERS
void foo(int* const data)
{
  int* const slot = data + threadIdx.x;
  *slot = 5;
}
//
// SUBROUTINE: MULTIPLY THE ELEMENT OF data INDEXED BY threadIdx.x BY 10.
//
DEVICE_FUNCTION_QUALIFIERS
void bar(int* const data)
{
  const unsigned int tid = threadIdx.x;
  data[tid] = data[tid] * 10;
}
//
// SUBROUTINE: ADD 15 TO THE ELEMENT OF data INDEXED BY threadIdx.x.
//
DEVICE_FUNCTION_QUALIFIERS
void baz(int* const data)
{
  int& slot = data[threadIdx.x];
  slot = slot + 15;
}
//
// THE DICTIONARY IS A TABLE OF ALL SUBROUTINES.
//
// A PROGRAM IS AN ARRAY OF SUBROUTINE ADDRESSES TAKEN FROM THE DICTIONARY.
//
// THE PROGRAM IS INITIALIZED FROM THE HOST BY A cudaMemcpyToSymbol.
//
// END EVERY PROGRAM WITH THE EXIT SUBROUTINE; IT IS WHAT TERMINATES
// THE THREAD.
//
// INSTEAD OF __constant__ MEMORY, THE PROGRAM COULD LIVE IN ANOTHER KIND
// OF HOST-MODIFIABLE MEMORY SPACE.
//
// SUBROUTINE TABLE; A PROGRAM REFERS TO A SUBROUTINE BY ITS INDEX HERE.
__constant__ threadedFunc deviceDictionary[] = {
  cudaExit, // 0
  foo,      // 1
  bar,      // 2
  baz       // 3
};
// SUBROUTINE ADDRESSES EXECUTED IN ORDER BY THE KERNEL; FILLED IN BY THE HOST.
__constant__ threadedFunc deviceProgram[256];
// | |
// | |
// | |
//
// KERNEL: INTERPRET deviceProgram BY CALLING EACH STORED SUBROUTINE IN
// ORDER, PASSING data THROUGH UNCHANGED.
//
// THE SUBROUTINES INDEX data BY threadIdx.x, SO data MUST HOLD AT LEAST
// blockDim.x INTS AND THE KERNEL IS MEANT FOR A SINGLE 1D BLOCK.
//
// A THREAD NORMALLY LEAVES THE LOOP WHEN THE PROGRAM CALLS THE EXIT
// SUBROUTINE. AS A SAFETY NET THE LOOP ALSO STOPS AT AN EMPTY (NULL)
// SLOT AND NEVER READS PAST THE END OF deviceProgram, SO A PROGRAM THAT
// IS MISSING ITS EXIT ENTRY CANNOT RUN OFF THE TABLE.
//
__global__
LAUNCH_BOUNDS
void start(int* const data)
{
  const unsigned int capacity = sizeof(deviceProgram) / sizeof(deviceProgram[0]);

  for (unsigned int pc = 0; pc < capacity; pc++)
    {
      const threadedFunc op = deviceProgram[pc];

      if (op == 0)
        break;

      op(data);
    }
}
// | |
// | |
// | |
////////////////////////////////////////////////////////////////////////////// | |
// | |
// HOST | |
// | |
// DEFINE A SUBROUTINE THREADED PROGRAM | |
// DICTIONARY INDICES OF THE SUBROUTINES, IN deviceDictionary ORDER
enum {
  OP_EXIT = 0,
  OP_FOO  = 1,
  OP_BAR  = 2,
  OP_BAZ  = 3
};

// 5 + 15 = 20
const int prog20[] = {
  OP_FOO,  // INIT TO 5
  OP_BAZ,  // ADD 15
  OP_EXIT
};

// 5 + 15 + 15 = 35
const int prog35[] = {
  OP_FOO,  // INIT TO 5
  OP_BAZ,  // ADD 15
  OP_BAZ,  // ADD 15
  OP_EXIT
};
// | |
// | |
// | |
// THREADS IN THE SINGLE BLOCK THAT IS LAUNCHED
#define NUM_THREADS      32
// BYTES OF DEVICE DATA: ONE int PER THREAD
#define DEVICE_DATA_SIZE (sizeof(int) * NUM_THREADS)
// WHICH HOST-SIDE PROGRAM IS BUILT AND RUN
#define PROGRAM          prog35
// | |
// | |
// | |
//
// PRINT A DIAGNOSTIC ON stderr WHEN A CUDA RUNTIME CALL DID NOT RETURN
// cudaSuccess. RETURNS true ON SUCCESS SO CALLS CAN BE GUARDED INLINE.
//
static bool cudaCallSucceeded(const cudaError_t status, const char* const what)
{
  if (status == cudaSuccess)
    return true;

  fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(status));
  return false;
}

//
// TRANSLATE PROGRAM INTO SUBROUTINE ADDRESSES, UPLOAD IT, RUN THE KERNEL
// ON deviceData AND PRINT ONE RESULT PER THREAD.
//
// deviceData MUST POINT TO DEVICE_DATA_SIZE BYTES OF DEVICE MEMORY.
// RETURNS false (AFTER PRINTING A DIAGNOSTIC) ON THE FIRST FAILURE.
//
static bool runThreadedProgram(int* const deviceData)
{
  //
  // COPY DICTIONARY TO HOST -- DEVICE FUNCTION ADDRESSES ARE ONLY
  // AVAILABLE TO THE HOST BY READING THEM BACK FROM THE DEVICE SYMBOL
  //
  threadedFunc hostDictionary[256];
  size_t       dictionarySize = 0;

  if (!cudaCallSucceeded(cudaGetSymbolSize(&dictionarySize,deviceDictionary),
                         "cudaGetSymbolSize(deviceDictionary)"))
    return false;

  if (dictionarySize > sizeof(hostDictionary))
    {
      fprintf(stderr,"dictionary of %lu bytes does not fit in the host copy\n",
              (unsigned long)dictionarySize);
      return false;
    }

  if (!cudaCallSucceeded(cudaMemcpyFromSymbol(hostDictionary,deviceDictionary,dictionarySize),
                         "cudaMemcpyFromSymbol(deviceDictionary)"))
    return false;

  const size_t dictionaryCount = dictionarySize / sizeof(threadedFunc);

  //
  // INITIALIZE A "PROGRAM" AND COPY IT TO THE DEVICE
  //
  // ZERO-FILLED SO THE UNUSED TAIL IS NOT UNINITIALIZED STACK MEMORY
  //
  threadedFunc hostProgram[256] = { 0 };

  const size_t programCapacity = sizeof(hostProgram) / sizeof(hostProgram[0]);
  const size_t programLength   = sizeof(PROGRAM)     / sizeof(PROGRAM[0]);

  if (programLength > programCapacity)
    {
      fprintf(stderr,"program of %lu entries exceeds the capacity of %lu\n",
              (unsigned long)programLength,(unsigned long)programCapacity);
      return false;
    }

  for (size_t ii=0; ii<programLength; ii++)
    {
      const int opcode = PROGRAM[ii];

      if (opcode < 0 || (size_t)opcode >= dictionaryCount)
        {
          fprintf(stderr,"program entry %lu refers to unknown subroutine %d\n",
                  (unsigned long)ii,opcode);
          return false;
        }

      hostProgram[ii] = hostDictionary[opcode];
    }

  if (!cudaCallSucceeded(cudaMemcpyToSymbol(deviceProgram,hostProgram,sizeof(hostProgram)),
                         "cudaMemcpyToSymbol(deviceProgram)"))
    return false;

  //
  // LAUNCH KERNEL -- A LAUNCH RETURNS NO STATUS ITSELF, SO CHECK FOR
  // LAUNCH ERRORS FIRST AND EXECUTION ERRORS AFTER THE SYNCHRONIZE
  //
  start<<<1,NUM_THREADS>>>(deviceData);

  if (!cudaCallSucceeded(cudaGetLastError(),"start<<<>>> launch"))
    return false;

  if (!cudaCallSucceeded(cudaDeviceSynchronize(),"cudaDeviceSynchronize"))
    return false;

  //
  // LOOK AT RESULTS
  //
  int hostData[NUM_THREADS];

  if (!cudaCallSucceeded(cudaMemcpy(hostData,deviceData,DEVICE_DATA_SIZE,cudaMemcpyDeviceToHost),
                         "cudaMemcpy(hostData)"))
    return false;

  for (int ii=0; ii<NUM_THREADS; ii++)
    printf("%4d ",hostData[ii]);

  printf("\n");

  return true;
}

//
// ALLOCATE THE DEVICE DATA, RUN THE SELECTED PROGRAM ON IT AND RELEASE
// THE DATA AGAIN. RETURNS 0 ON SUCCESS AND 1 IF ANY CUDA CALL OR
// VALIDATION STEP FAILED.
//
int main(int argc, char **argv)
{
  (void)argc;
  (void)argv;

  int* deviceData = NULL;

  if (!cudaCallSucceeded(cudaMalloc(&deviceData,DEVICE_DATA_SIZE),"cudaMalloc"))
    return 1;

  const bool ran = runThreadedProgram(deviceData);

  // ALWAYS RELEASE THE ALLOCATION, EVEN WHEN THE RUN FAILED
  const bool freed = cudaCallSucceeded(cudaFree(deviceData),"cudaFree");

  return (ran && freed) ? 0 : 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output:
cuobjdump -sass threadedCode.exe