Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A primitive example of threaded code in CUDA.
#include <stdio.h>
//
//
//
#define LAUNCH_BOUNDS // __launch_bounds__(512)
#define DEVICE_FUNCTION_QUALIFIERS __device__
//
//
//
typedef void (*threadedFunc)(int* const data);
//
//
//
DEVICE_FUNCTION_QUALIFIERS
void cudaExit(int* const data)
{
asm("exit;");
}
DEVICE_FUNCTION_QUALIFIERS
void foo(int* const data)
{
data[threadIdx.x] = 5;
}
DEVICE_FUNCTION_QUALIFIERS
void bar(int* const data)
{
data[threadIdx.x] *= 10;
}
DEVICE_FUNCTION_QUALIFIERS
void baz(int* const data)
{
data[threadIdx.x] += 15;
}
//
// DICTIONARY IS A TABLE OF ALL SUBROUTINES
//
// PROGRAM IS AN ARRAY OF SUBROUTINE ADDRESSES
//
// PROGRAM IS INITIALIZED BY A cudaMemcpyToSymbol
//
// END WITH AN EXIT IF YOU WANT TO RETURN.
//
// INSTEAD OF A __constant__ THIS COULD BE ANOTHER KIND OF
// HOST-MODIFIABLE MEMORY SPACE.
//
__constant__ threadedFunc deviceDictionary[] = { cudaExit, foo, bar, baz };
__constant__ threadedFunc deviceProgram[256];
//
//
//
__global__
LAUNCH_BOUNDS
void start(int* const data)
{
unsigned int pc = 0;
while (true)
deviceProgram[pc++](data);
}
//
//
//
//////////////////////////////////////////////////////////////////////////////
//
// HOST
//
// DEFINE A SUBROUTINE THREADED PROGRAM
const int prog20[] = {
1, // FOO -- INIT TO 5
3, // BAZ -- ADD 15
0 // EXIT
};
const int prog35[] = {
1, // FOO -- INIT TO 5
3, // BAZ -- ADD 15
3, // BAZ -- ADD 15
0 // EXIT
};
//
//
//
#define NUM_THREADS 32
#define DEVICE_DATA_SIZE (NUM_THREADS*sizeof(int))
#define PROGRAM prog35
//
//
//
int main(int argc, char **argv)
{
cudaError_t err;
int* deviceData;
cudaMalloc(&deviceData,DEVICE_DATA_SIZE);
//
// COPY DICTIONARY TO HOST
//
threadedFunc hostDictionary[256];
size_t dictionarySize;
cudaGetSymbolSize(&dictionarySize,deviceDictionary);
cudaMemcpyFromSymbol(hostDictionary,deviceDictionary,dictionarySize);
//
// INITIALIZE A "PROGRAM" AND COPY IT TO THE DEVICE
//
threadedFunc hostProgram[256];
for (int ii=0, jj=0; ii<(sizeof(PROGRAM)/sizeof(int)); ii++)
hostProgram[jj++] = hostDictionary[PROGRAM[ii]];
cudaMemcpyToSymbol(deviceProgram,hostProgram,sizeof(hostProgram));
//
// LAUNCH KERNEL
//
start<<<1,NUM_THREADS>>>(deviceData);
err = cudaDeviceSynchronize();
//
// LOOK AT RESULTS
//
int* hostData = (int*)malloc(DEVICE_DATA_SIZE);
cudaMemcpy(hostData,deviceData,DEVICE_DATA_SIZE,cudaMemcpyDeviceToHost);
for (int ii=0; ii<NUM_THREADS; ii++)
printf("%4d ",hostData[ii]);
printf("\n");
//
//
//
cudaFree(deviceData);
free(hostData);
return 0;
}
@allanmac
Copy link
Author

allanmac commented Apr 11, 2016

Output: cuobjdump -sass threadedCode.exe

 code for sm_52
                Function : _Z5startPi
        .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                 /* 0x081fd000fec007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];         /* 0x4c98078000870001 */
        /*0010*/                   MOV R1, RZ;                   /* 0x5c9807800ff70001 */
        /*0018*/                   SHL R3, R1.reuse, 0x3;        /* 0x3848000000370103 */
                                                                 /* 0x001fc400fe200711 */
        /*0028*/                   LDC.64 R2, c[0x3][R3];        /* 0xef95003000070302 */
        /*0030*/                   IADD32I R0, R1, 0x1;          /* 0x1c00000000170100 */
        /*0038*/                   MOV R4, c[0x0][0x140];        /* 0x4c98078005070004 */
                                                                 /* 0x003fc400fcc007f4 */
        /*0048*/                   MOV R5, c[0x0][0x144];        /* 0x4c98078005170005 */
        /*0050*/                   MOV R1, R0;                   /* 0x5c98078000070001 */
        /*0058*/                   PRET 0x18;                    /* 0xe2700ffffb800040 */
                                                                 /* 0x083fc400e3e007fd */
        /*0068*/                   BRX R2 -0x70;                 /* 0xe2500ffff907020f */
        /*0070*/                   S2R R3, SR_TID.X;             /* 0xf0c8000002170003 */
        /*0078*/                   SHL R0, R3.reuse, 0x2;        /* 0x3848000000270300 */
                                                                 /* 0x001fc400fec007f5 */
        /*0088*/                   SHR.U32 R3, R3, 0x1e;         /* 0x3828000001e70303 */
        /*0090*/                   IADD R0.CC, R0, R4;           /* 0x5c10800000470000 */
        /*0098*/                   IADD.X R3, R3, R5;            /* 0x5c10080000570303 */
                                                                 /* 0x0001c800fda207f6 */
        /*00a8*/                   LEA R4.CC, R0.reuse, RZ;      /* 0x5bd780000ff70004 */
        /*00b0*/                   LEA.HI.X P0, R5, R0, RZ, R3;  /* 0x5bd801c00ff70005 */
        /*00b8*/                   LD.E R0, [R4], P0;            /* 0x8090000000070400 */
                                                                 /* 0x0003f400fe8047e6 */
        /*00c8*/                   XMAD R3, R0, 0xa, RZ;         /* 0x36007f8000a70003 */
        /*00d0*/                   XMAD.PSL R3, R0.H1, 0xa, R3;  /* 0x3620019000a70003 */
        /*00d8*/                   ST.E [R4], R3, P0;            /* 0xa090000000070403 */
                                                                 /* 0x083fc400e3e00fff */
        /*00e8*/                   RET;                          /* 0xe32000000007000f */
        /*00f0*/                   S2R R3, SR_TID.X;             /* 0xf0c8000002170003 */
        /*00f8*/                   SHL R0, R3.reuse, 0x2;        /* 0x3848000000270300 */
                                                                 /* 0x001fc400fec007f5 */
        /*0108*/                   SHR.U32 R3, R3, 0x1e;         /* 0x3828000001e70303 */
        /*0110*/                   IADD R0.CC, R0, R4;           /* 0x5c10800000470000 */
        /*0118*/                   IADD.X R3, R3, R5;            /* 0x5c10080000570303 */
                                                                 /* 0x0001c800fda207f6 */
        /*0128*/                   LEA R4.CC, R0.reuse, RZ;      /* 0x5bd780000ff70004 */
        /*0130*/                   LEA.HI.X P0, R5, R0, RZ, R3;  /* 0x5bd801c00ff70005 */
        /*0138*/                   LD.E R0, [R4], P0;            /* 0x8090000000070400 */
                                                                 /* 0x003ffc001fa047e4 */
        /*0148*/                   IADD32I R3, R0, 0xf;          /* 0x1c00000000f70003 */
        /*0150*/                   ST.E [R4], R3, P0;            /* 0xa090000000070403 */
        /*0158*/                   RET;                          /* 0xe32000000007000f */
                                                                 /* 0x001fd441fe20071f */
        /*0168*/                   S2R R3, SR_TID.X;             /* 0xf0c8000002170003 */
        /*0170*/                   SHL R0, R3.reuse, 0x2;        /* 0x3848000000270300 */
        /*0178*/                   SHR.U32 R3, R3, 0x1e;         /* 0x3828000001e70303 */
                                                                 /* 0x081fd800fe2007f6 */
        /*0188*/                   IADD R0.CC, R0, R4;           /* 0x5c10800000470000 */
        /*0190*/                   IADD.X R3, R3, R5;            /* 0x5c10080000570303 */
        /*0198*/                   LEA R4.CC, R0.reuse, RZ;      /* 0x5bd780000ff70004 */
                                                                 /* 0x0003f400fd8007e1 */
        /*01a8*/                   LEA.HI.X P0, R5, R0, RZ, R3;  /* 0x5bd801c00ff70005 */
        /*01b0*/                   MOV32I R3, 0x5;               /* 0x010000000057f003 */
        /*01b8*/                   ST.E [R4], R3, P0;            /* 0xa090000000070403 */
                                                                 /* 0x001ffc00ffe00fff */
        /*01c8*/                   RET;                          /* 0xe32000000007000f */
        /*01d0*/                   EXIT;                         /* 0xe30000000007000f */
        /*01d8*/                   BRA 0x1d8;                    /* 0xe2400fffff87000f */
                                                                 /* 0x001f8000fc0007e0 */
        /*01e8*/                   NOP;                          /* 0x50b0000000070f00 */
        /*01f0*/                   NOP;                          /* 0x50b0000000070f00 */
        /*01f8*/                   NOP;                          /* 0x50b0000000070f00 */
                ...........................

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment