Skip to content

Instantly share code, notes, and snippets.

@allanmac
Created March 5, 2015 00:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/4a1851480ca9bf318f68 to your computer and use it in GitHub Desktop.
Save allanmac/4a1851480ca9bf318f68 to your computer and use it in GitHub Desktop.
ld.global.nc (LDG.CI) operations not being generated when const+restrict pointers are within a const struct passed as a kernel argument
//
//
//
#define WARP_SIZE 32
#define KERNEL_QUALIFIERS extern "C" __global__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#define RESTRICT __restrict__
//
//
//
KERNEL_QUALIFIERS
void ldg_good(const unsigned int* const RESTRICT vin,
unsigned int* const RESTRICT vout)
{
const unsigned int v0 = vin[threadIdx.x+32*0];
const unsigned int v1 = vin[threadIdx.x+32*1];
const unsigned int v2 = vin[threadIdx.x+32*2];
const unsigned int v3 = vin[threadIdx.x+32*3];
vout[threadIdx.x+32*0] = v0;
vout[threadIdx.x+32*1] = v1;
vout[threadIdx.x+32*2] = v2;
vout[threadIdx.x+32*3] = v3;
}
//
//
//
struct ldg_args
{
const unsigned int* RESTRICT vin;
unsigned int* RESTRICT vout;
};
KERNEL_QUALIFIERS
void ldg_bad(const struct ldg_args args)
{
const unsigned int v0 = args.vin[threadIdx.x+32*0];
const unsigned int v1 = args.vin[threadIdx.x+32*1];
const unsigned int v2 = args.vin[threadIdx.x+32*2];
const unsigned int v3 = args.vin[threadIdx.x+32*3];
args.vout[threadIdx.x+32*0] = v0;
args.vout[threadIdx.x+32*1] = v1;
args.vout[threadIdx.x+32*2] = v2;
args.vout[threadIdx.x+32*3] = v3;
}
//
//
//
@allanmac
Copy link
Author

allanmac commented Mar 5, 2015

Compile the attached snippet with:

nvcc -m 32 -arch sm_50 -Xptxas=-v -cubin ldg.cu

Dump the SASS with: cuobjdump.exe -sass ldg.cubin or nvdisasm ldg.cubin:

ldg_good:
.text.ldg_good:
        /*0008*/                   MOV R1, c[0x0][0x20];
        /*0010*/                   S2R R5, SR_TID.X;
        /*0018*/                   ISCADD R0, R5.reuse, c[0x0][0x140], 0x2;
        /*0028*/                   LDG.CI R6, [R0];
        /*0030*/                   LDG.CI R2, [R0+0x80];
        /*0038*/                   LDG.CI R3, [R0+0x100];
        /*0048*/                   LDG.CI R4, [R0+0x180];
        /*0050*/                   ISCADD R5, R5, c[0x0][0x144], 0x2;
        /*0058*/                   STG [R5], R6;
        /*0068*/                   STG [R5+0x80], R2;
        /*0070*/                   DEPBAR.LE SB5, 0x1;
        /*0078*/                   STG [R5+0x100], R3;
        /*0088*/                   STG [R5+0x180], R4;
        /*0090*/                   EXIT;
.L_1:
        /*0098*/                   BRA `(.L_1);

ldg_bad:
.text.ldg_bad:
        /*0008*/                   MOV R1, c[0x0][0x20];
        /*0010*/                   S2R R5, SR_TID.X;
        /*0018*/                   ISCADD R0, R5.reuse, c[0x0][0x140], 0x2;
        /*0028*/                   LDG R6, [R0];        <-------------------- BAD
        /*0030*/                   LDG R2, [R0+0x80];   <-------------------- BAD
        /*0038*/                   LDG R3, [R0+0x100];  <-------------------- BAD
        /*0048*/                   LDG R4, [R0+0x180];  <-------------------- BAD
        /*0050*/                   ISCADD R5, R5, c[0x0][0x144], 0x2;
        /*0058*/                   STG [R5], R6;
        /*0068*/                   STG [R5+0x80], R2;
        /*0070*/                   DEPBAR.LE SB5, 0x1;
        /*0078*/                   STG [R5+0x100], R3;
        /*0088*/                   STG [R5+0x180], R4;
        /*0090*/                   EXIT;
.L_2:
        /*0098*/                   BRA `(.L_2);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment