Created
February 6, 2013 20:20
-
-
Save allanmac/4725448 to your computer and use it in GitHub Desktop.
Exercise vector load and store. Also evaluate LDG.CT.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// | |
// | |
// Warp width; LD()/ST() multiply it into the per-step index offset.
#define WARP_SIZE 32
// No-alias qualifier applied to the kernel pointer arguments.
#define RESTRICT  __restrict
// | |
// | |
// | |
// Short aliases for the unsigned scalar element types used by the kernels.
typedef unsigned char  uchar;
typedef unsigned short ushort;
typedef unsigned int   uint;
// | |
// | |
// | |
// LD(type,i): declare a local `r<i>` of `type`, initialised from
//             vin[threadIdx.x + i*sizeof(type)*WARP_SIZE].
// ST(type,i): assign `r<i>` to the same index of vout.
//
// `i` is pasted into the variable name, so it must be a literal token.
// `vin`, `vout`, `threadIdx` and WARP_SIZE must be visible where the macro
// is expanded. Neither macro ends in ';' -- the caller supplies it.
//
// NOTE(review): when vin/vout point to `type` (as in the kernels in this
// file) the subscript counts elements, so the sizeof(type) factor spaces
// consecutive steps sizeof(type)*WARP_SIZE elements apart rather than
// WARP_SIZE -- confirm that spacing is intended.
#define LD(type,i)                                              \
  type r##i = vin[threadIdx.x+(i*sizeof(type)*WARP_SIZE)]
#define ST(type,i)                                              \
  vout[threadIdx.x+(i*sizeof(type)*WARP_SIZE)] = r##i
// | |
// | |
// | |
// LD_<n>(type): expands to LD(type,0); ... ; LD(type,n-1), i.e. one LD per
// index 0..n-1 in ascending order. The final LD carries no ';', so invoke
// as `LD_8(T);`. Each variant is layered on the next smaller one; LD_14 and
// LD_16 both extend LD_8, with LD_16 adding indices 14 and 15 to LD_14.
#define LD_1(type)                              \
  LD(type,0)

#define LD_2(type)                              \
  LD_1(type);                                   \
  LD(type,1)

#define LD_4(type)                              \
  LD_2(type);                                   \
  LD(type,2);                                   \
  LD(type,3)

#define LD_8(type)                              \
  LD_4(type);                                   \
  LD(type,4);                                   \
  LD(type,5);                                   \
  LD(type,6);                                   \
  LD(type,7)

#define LD_14(type)                             \
  LD_8(type);                                   \
  LD(type,8);                                   \
  LD(type,9);                                   \
  LD(type,10);                                  \
  LD(type,11);                                  \
  LD(type,12);                                  \
  LD(type,13)

#define LD_16(type)                             \
  LD_14(type);                                  \
  LD(type,14);                                  \
  LD(type,15)
// | |
// | |
// | |
// ST_<n>(type): expands to ST(type,0); ... ; ST(type,n-1), i.e. one ST per
// index 0..n-1 in ascending order. The final ST carries no ';', so invoke
// as `ST_8(T);`. Each variant is layered on the next smaller one; ST_14 and
// ST_16 both extend ST_8, with ST_16 adding indices 14 and 15 to ST_14.
#define ST_1(type)                              \
  ST(type,0)

#define ST_2(type)                              \
  ST_1(type);                                   \
  ST(type,1)

#define ST_4(type)                              \
  ST_2(type);                                   \
  ST(type,2);                                   \
  ST(type,3)

#define ST_8(type)                              \
  ST_4(type);                                   \
  ST(type,4);                                   \
  ST(type,5);                                   \
  ST(type,6);                                   \
  ST(type,7)

#define ST_14(type)                             \
  ST_8(type);                                   \
  ST(type,8);                                   \
  ST(type,9);                                   \
  ST(type,10);                                  \
  ST(type,11);                                  \
  ST(type,12);                                  \
  ST(type,13)

#define ST_16(type)                             \
  ST_14(type);                                  \
  ST(type,14);                                  \
  ST(type,15)
// | |
// | |
// | |
// LOAD/STOR are the names the kernels invoke; they currently forward to the
// 14-element variants. Point both at the same LD_<n>/ST_<n> pair so that
// every local declared by LOAD is written back by STOR.
#define LOAD(type) LD_14(type)
#define STOR(type) ST_14(type)
// | |
// | |
// | |
// Copy kernel for `uchar` elements.
// LOAD reads the elements it selects from vin into per-thread locals and
// STOR then writes those locals to the same indices of vout.
// Indexing uses threadIdx.x only and there is no bounds check, so the
// caller must size both buffers for the largest index touched.
__global__
void u1(const uchar* const RESTRICT vin,
              uchar* const RESTRICT vout)
{
  LOAD(uchar);
  STOR(uchar);
}
// Copy kernel for `ushort` elements.
// LOAD reads the elements it selects from vin into per-thread locals and
// STOR then writes those locals to the same indices of vout.
// Indexing uses threadIdx.x only and there is no bounds check, so the
// caller must size both buffers for the largest index touched.
__global__
void u2(const ushort* const RESTRICT vin,
              ushort* const RESTRICT vout)
{
  LOAD(ushort);
  STOR(ushort);
}
// Copy kernel for `uint` elements.
// LOAD reads the elements it selects from vin into per-thread locals and
// STOR then writes those locals to the same indices of vout.
// Indexing uses threadIdx.x only and there is no bounds check, so the
// caller must size both buffers for the largest index touched.
__global__
void u4(const uint* const RESTRICT vin,
              uint* const RESTRICT vout)
{
  LOAD(uint);
  STOR(uint);
}
// Copy kernel for `uint2` elements (CUDA built-in two-component vector type).
// LOAD reads the elements it selects from vin into per-thread locals and
// STOR then writes those locals to the same indices of vout.
// Indexing uses threadIdx.x only and there is no bounds check, so the
// caller must size both buffers for the largest index touched.
__global__
void u8(const uint2* const RESTRICT vin,
              uint2* const RESTRICT vout)
{
  LOAD(uint2);
  STOR(uint2);
}
// Copy kernel for `uint4` elements (CUDA built-in four-component vector type).
// LOAD reads the elements it selects from vin into per-thread locals and
// STOR then writes those locals to the same indices of vout.
// Indexing uses threadIdx.x only and there is no bounds check, so the
// caller must size both buffers for the largest index touched.
__global__
void u16(const uint4* const RESTRICT vin,
               uint4* const RESTRICT vout)
{
  LOAD(uint4);
  STOR(uint4);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compiled with:
nvcc -m 32 -arch sm_35 -Xptxas=-v -cubin gmem.cu
TEXDEPBAR is interesting: