Skip to content

Instantly share code, notes, and snippets.

@allanmac
Created February 6, 2013 20:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/4725448 to your computer and use it in GitHub Desktop.
Save allanmac/4725448 to your computer and use it in GitHub Desktop.
Exercise vector load and store. Also evaluate LDG.CT.
//
//
//
#define WARP_SIZE 32
#define RESTRICT __restrict
//
//
//
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
//
//
//
#define LD(type,i) \
type r##i = vin[threadIdx.x+(i*sizeof(type)*WARP_SIZE)]
#define ST(type,i) \
vout[threadIdx.x+(i*sizeof(type)*WARP_SIZE)] = r##i
//
//
//
#define LD_1(type) \
LD(type,0)
#define LD_2(type) \
LD_1(type); \
LD(type,1)
#define LD_4(type) \
LD_2(type); \
LD(type,2); \
LD(type,3)
#define LD_8(type) \
LD_4(type); \
LD(type,4); \
LD(type,5); \
LD(type,6); \
LD(type,7)
#define LD_14(type) \
LD_8(type); \
LD(type,8); \
LD(type,9); \
LD(type,10); \
LD(type,11); \
LD(type,12); \
LD(type,13)
#define LD_16(type) \
LD_8(type); \
LD(type,8); \
LD(type,9); \
LD(type,10); \
LD(type,11); \
LD(type,12); \
LD(type,13); \
LD(type,14); \
LD(type,15)
//
//
//
#define ST_1(type) \
ST(type,0)
#define ST_2(type) \
ST_1(type); \
ST(type,1)
#define ST_4(type) \
ST_2(type); \
ST(type,2); \
ST(type,3)
#define ST_8(type) \
ST_4(type); \
ST(type,4); \
ST(type,5); \
ST(type,6); \
ST(type,7)
#define ST_14(type) \
ST_8(type); \
ST(type,8); \
ST(type,9); \
ST(type,10); \
ST(type,11); \
ST(type,12); \
ST(type,13)
#define ST_16(type) \
ST_8(type); \
ST(type,8); \
ST(type,9); \
ST(type,10); \
ST(type,11); \
ST(type,12); \
ST(type,13); \
ST(type,14); \
ST(type,15)
//
//
//
#define LOAD(type) \
LD_14(type)
#define STOR(type) \
ST_14(type)
//
//
//
__global__
void u1(const uchar* const RESTRICT vin,
uchar* const RESTRICT vout)
{
LOAD(uchar);
STOR(uchar);
}
__global__
void u2(const ushort* const RESTRICT vin,
ushort* const RESTRICT vout)
{
LOAD(ushort);
STOR(ushort);
}
__global__
void u4(const uint* const RESTRICT vin,
uint* const RESTRICT vout)
{
LOAD(uint);
STOR(uint);
}
__global__
void u8(const uint2* const RESTRICT vin,
uint2* const RESTRICT vout)
{
LOAD(uint2);
STOR(uint2);
}
__global__
void u16(const uint4* const RESTRICT vin,
uint4* const RESTRICT vout)
{
LOAD(uint4);
STOR(uint4);
}
@allanmac
Copy link
Author

allanmac commented Feb 6, 2013

Compiled with: nvcc -m 32 -arch sm_35 -Xptxas=-v -cubin gmem.cu

TEXDEPBAR is interesting:

    Function : _Z2u4PKjPj
/*0008*/     /*0x089c000664c03c00*/     MOV R1, c [0x0] [0x44];
/*0010*/     /*0x109c003e86400000*/     S2R R15, SR33;
/*0018*/     /*0x281c3c0260c00800*/     ISCADD R0, R15, c [0x0] [0x140], 0x2;
/*0020*/     /*0x001c0009c0800001*/     IADD R2, R0, 0x200;
/*0028*/     /*0x7f9c003960021004*/     LDG.CT.32 R14, [R0];
/*0030*/     /*0x289c3c3e60c00800*/     ISCADD R15, R15, c [0x0] [0x144], 0x2;
/*0038*/     /*0x7f9c083560021004*/     LDG.CT.32 R13, [R2];
/*0048*/     /*0x001c0009c0800002*/     IADD R2, R0, 0x400;
/*0050*/     /*0x7f9c083160021004*/     LDG.CT.32 R12, [R2];
/*0058*/     /*0x001c0009c0800003*/     IADD R2, R0, 0x600;
/*0060*/     /*0x7f9c082d60021004*/     LDG.CT.32 R11, [R2];
/*0068*/     /*0x001c0009c0800004*/     IADD R2, R0, 0x800;
/*0070*/     /*0x7f9c082960021004*/     LDG.CT.32 R10, [R2];
/*0078*/     /*0x001c0009c0800005*/     IADD R2, R0, 0xa00;
/*0088*/     /*0x7f9c082560021004*/     LDG.CT.32 R9, [R2];
/*0090*/     /*0x001c0009c0800006*/     IADD R2, R0, 0xc00;
/*0098*/     /*0x7f9c082160021004*/     LDG.CT.32 R8, [R2];
/*00a0*/     /*0x001c0009c0800007*/     IADD R2, R0, 0xe00;
/*00a8*/     /*0x7f9c081d60021004*/     LDG.CT.32 R7, [R2];
/*00b0*/     /*0x001c0009c0800008*/     IADD R2, R0, 0x1000;
/*00b8*/     /*0x7f9c081960021004*/     LDG.CT.32 R6, [R2];
/*00c8*/     /*0x001c0009c0800009*/     IADD R2, R0, 0x1200;
/*00d0*/     /*0x7f9c081560021004*/     LDG.CT.32 R5, [R2];
/*00d8*/     /*0x001c0009c080000a*/     IADD R2, R0, 0x1400;
/*00e0*/     /*0x7f9c081160021004*/     LDG.CT.32 R4, [R2];
/*00e8*/     /*0x001c0009c080000b*/     IADD R2, R0, 0x1600;
/*00f0*/     /*0x7f9c080d60021004*/     LDG.CT.32 R3, [R2];
/*00f8*/     /*0x001c0009c080000c*/     IADD R2, R0, 0x1800;
/*0108*/     /*0x001c0001c080000d*/     IADD R0, R0, 0x1a00;
/*0110*/     /*0x7f9c080960021004*/     LDG.CT.32 R2, [R2];
/*0118*/     /*0x7f9c000160021004*/     LDG.CT.32 R0, [R0];
/*0120*/     /*0x069c000277000000*/     TEXDEPBAR 0xd;
/*0128*/     /*0x001c3c38e4000000*/     ST [R15], R14;
/*0130*/     /*0x061c000277000000*/     TEXDEPBAR 0xc;
/*0138*/     /*0x001c3c34e4000001*/     ST [R15+0x200], R13;
/*0148*/     /*0x059c000277000000*/     TEXDEPBAR 0xb;
/*0150*/     /*0x001c3c30e4000002*/     ST [R15+0x400], R12;
/*0158*/     /*0x051c000277000000*/     TEXDEPBAR 0xa;
/*0160*/     /*0x001c3c2ce4000003*/     ST [R15+0x600], R11;
/*0168*/     /*0x049c000277000000*/     TEXDEPBAR 0x9;
/*0170*/     /*0x001c3c28e4000004*/     ST [R15+0x800], R10;
/*0178*/     /*0x041c000277000000*/     TEXDEPBAR 0x8;
/*0188*/     /*0x001c3c24e4000005*/     ST [R15+0xa00], R9;
/*0190*/     /*0x039c000277000000*/     TEXDEPBAR 0x7;
/*0198*/     /*0x001c3c20e4000006*/     ST [R15+0xc00], R8;
/*01a0*/     /*0x031c000277000000*/     TEXDEPBAR 0x6;
/*01a8*/     /*0x001c3c1ce4000007*/     ST [R15+0xe00], R7;
/*01b0*/     /*0x029c000277000000*/     TEXDEPBAR 0x5;
/*01b8*/     /*0x001c3c18e4000008*/     ST [R15+0x1000], R6;
/*01c8*/     /*0x021c000277000000*/     TEXDEPBAR 0x4;
/*01d0*/     /*0x001c3c14e4000009*/     ST [R15+0x1200], R5;
/*01d8*/     /*0x019c000277000000*/     TEXDEPBAR 0x3;
/*01e0*/     /*0x001c3c10e400000a*/     ST [R15+0x1400], R4;
/*01e8*/     /*0x011c000277000000*/     TEXDEPBAR 0x2;
/*01f0*/     /*0x001c3c0ce400000b*/     ST [R15+0x1600], R3;
/*01f8*/     /*0x009c000277000000*/     TEXDEPBAR 0x1;
/*0208*/     /*0x001c3c08e400000c*/     ST [R15+0x1800], R2;
/*0210*/     /*0x001c000277000000*/     TEXDEPBAR 0x0;
/*0218*/     /*0x001c3c00e400000d*/     ST [R15+0x1a00], R0;
/*0220*/     /*0x001c003c18000000*/     EXIT;
/*0228*/     /*0xfc1c003c12007fff*/     BRA 0x228;
/*0230*/     /*0x001c3c0285800000*/     NOP;
/*0238*/     /*0x001c3c0285800000*/     NOP;
    ...........................

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment