Skip to content

Instantly share code, notes, and snippets.

@jsteube
Created February 6, 2020 10:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jsteube/2e89e43bda98db61291bd07ed143cd55 to your computer and use it in GitHub Desktop.
Save jsteube/2e89e43bda98db61291bd07ed143cd55 to your computer and use it in GitHub Desktop.
NVCC issue POC
// I made this source code to demonstrate an issue which affects the compile process of CUDA kernels.
//
// - All GPU architectures are affected
// - Tested on CUDA SDK 10.x and 9.x
// - Tested on Ubuntu 18.04 LTS and Windows 10
__device__ void s (int *in, int *out);
__device__ void t (int *w0, int *w1, int *w2, int *w3, int *h);
// I created three artificial kernels. They all consist of 100% the same code.
// The code is just one function call. All three kernels call the same function using the same parameters.
__global__ void x1 (int *in, int *out)
{
s (in, out);
}
__global__ void x2 (int *in, int *out)
{
s (in, out);
}
__global__ void x3 (int *in, int *out)
{
s (in, out);
}
// Two of three kernels will end up in the same bytecode as expected.
// The third will create a different bytecode. This one will run at a reduced performance.
// This is not a theoretical problem.
// My real-world application suffers from this issue and runs at 15% reduced performance.
//
// Steps to reproduce:
//
// $ nvcc -arch=sm_75 -Xptxas="-v" 3k.cu 2>&1 | grep Used
// ptxas info : Used 70 registers, 368 bytes cmem[0]
// ptxas info : Used 53 registers, 368 bytes cmem[0]
// ptxas info : Used 53 registers, 368 bytes cmem[0]
//
// Alternative:
//
// $ nvcc -cubin -arch=sm_75 3k.cu
// $ cuobjdump --dump-resource-usage 3k.cubin | grep REG:
// REG:70 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:368 TEXTURE:0 SURFACE:0 SAMPLER:0
// REG:53 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:368 TEXTURE:0 SURFACE:0 SAMPLER:0
// REG:53 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:368 TEXTURE:0 SURFACE:0 SAMPLER:0
//
// The functions s() and t() are relevant to trigger the issue.
// Please do not try to see any sense in the code in the functions.
// They do not make sense anymore after I stripped them down as much as possible to simplify debugging for you.
__device__ void t (int *w0, int *w1, int *w2, int *w3, int *h)
{
#define STEP(a,b,x,K) \
{ \
a += K; \
a += x; \
a += b; \
}
int a = h[0];
int b = h[1];
int c = h[2];
int d = h[3];
STEP (a, b, w0[0], 0xd76aa478);
STEP (d, a, w0[1], 0xe8c7b756);
STEP (c, d, w0[2], 0x242070db);
STEP (b, c, w0[3], 0xc1bdceee);
STEP (a, b, w1[0], 0xf57c0faf);
STEP (d, a, w1[1], 0x4787c62a);
STEP (c, d, w1[2], 0xa8304613);
STEP (b, c, w1[3], 0xfd469501);
STEP (a, b, w2[0], 0x698098d8);
STEP (d, a, w2[1], 0x8b44f7af);
STEP (c, d, w2[2], 0xffff5bb1);
STEP (b, c, w2[3], 0x895cd7be);
STEP (a, b, w3[0], 0x6b901122);
STEP (d, a, w3[1], 0xfd987193);
STEP (c, d, w3[2], 0xa679438e);
STEP (b, c, w3[3], 0x49b40821);
STEP (a, b, w0[1], 0xd76aa478);
STEP (d, a, w1[2], 0xe8c7b756);
STEP (c, d, w2[3], 0x242070db);
STEP (b, c, w0[0], 0xc1bdceee);
STEP (a, b, w1[1], 0xf57c0faf);
STEP (d, a, w2[2], 0x4787c62a);
STEP (c, d, w3[3], 0xa8304613);
STEP (b, c, w1[0], 0xfd469501);
STEP (a, b, w2[1], 0x698098d8);
STEP (d, a, w3[2], 0x8b44f7af);
STEP (c, d, w0[3], 0xffff5bb1);
STEP (b, c, w2[0], 0x895cd7be);
STEP (a, b, w3[1], 0x6b901122);
STEP (d, a, w0[2], 0xfd987193);
STEP (c, d, w1[3], 0xa679438e);
STEP (b, c, w3[0], 0x49b40821);
STEP (a, b, w1[1], 0xd76aa478);
STEP (d, a, w2[0], 0xe8c7b756);
STEP (c, d, w2[3], 0x242070db);
STEP (b, c, w3[2], 0xc1bdceee);
STEP (a, b, w0[1], 0xf57c0faf);
STEP (d, a, w1[0], 0x4787c62a);
STEP (c, d, w1[3], 0xa8304613);
STEP (b, c, w2[2], 0xfd469501);
STEP (a, b, w3[1], 0x698098d8);
STEP (d, a, w0[0], 0x8b44f7af);
STEP (c, d, w0[3], 0xffff5bb1);
STEP (b, c, w1[2], 0x895cd7be);
STEP (a, b, w2[1], 0x6b901122);
STEP (d, a, w3[0], 0xfd987193);
STEP (c, d, w3[3], 0xa679438e);
STEP (b, c, w0[2], 0x49b40821);
STEP (a, b, w0[0], 0xd76aa478);
STEP (d, a, w1[3], 0xe8c7b756);
STEP (c, d, w3[2], 0x242070db);
STEP (b, c, w1[1], 0xc1bdceee);
STEP (a, b, w3[0], 0xf57c0faf);
STEP (d, a, w0[3], 0x4787c62a);
STEP (c, d, w2[2], 0xa8304613);
STEP (b, c, w0[1], 0xfd469501);
STEP (a, b, w2[0], 0x698098d8);
STEP (d, a, w3[3], 0x8b44f7af);
STEP (c, d, w1[2], 0xffff5bb1);
STEP (b, c, w3[1], 0x895cd7be);
STEP (a, b, w1[0], 0x6b901122);
STEP (d, a, w2[3], 0xfd987193);
STEP (c, d, w0[2], 0xa679438e);
STEP (b, c, w2[1], 0x49b40821);
h[0] += a;
h[1] += b;
h[2] += c;
h[3] += d;
}
__device__ void s (int *in, int *out)
{
int len = in[0];
int *buf = in;
for (int i = 0; i < 2; i++)
{
int w0[4] = { i };
int w1[4] = { 0 };
int w2[4] = { 0 };
int w3[4] = { 0 };
int ipad[4] = { 0 };
int opad[4] = { 0 };
t (w0, w0, w0, w0, ipad);
t (w0, w0, w0, w0, opad);
w0[0] = buf[0];
w0[1] = buf[0];
w0[2] = buf[0];
w0[3] = len;
int h[4];
t (w0, w0, w0, w0, h);
w0[0] = h[0];
w0[1] = h[1];
w0[2] = h[2];
w0[3] = h[3];
ipad[0] = 1;
ipad[1] = 1;
ipad[2] = 1;
ipad[3] = 1;
opad[0] = 1;
opad[1] = 1;
opad[2] = 1;
opad[3] = 1;
t (w0, w0, w0, w0, ipad);
t (w0, w0, w0, w0, opad);
int left;
int off;
for (left = len, off = 0; left >= 56; left -= 64, off += 16)
{
w0[0] = buf[off + 0];
w0[1] = buf[off + 1];
w0[2] = buf[off + 2];
w0[3] = buf[off + 3];
w1[0] = buf[off + 4];
w1[1] = buf[off + 5];
w1[2] = buf[off + 6];
w1[3] = buf[off + 7];
w2[0] = buf[off + 8];
w2[1] = buf[off + 9];
w2[2] = buf[off + 10];
w2[3] = buf[off + 11];
w3[0] = buf[off + 12];
w3[1] = buf[off + 13];
w3[2] = buf[off + 14];
w3[3] = buf[off + 15];
t (w0, w1, w2, w3, ipad);
}
w0[0] = buf[off + 0];
w0[1] = buf[off + 1];
w0[2] = buf[off + 2];
w0[3] = buf[off + 3];
w1[0] = buf[off + 4];
w1[1] = buf[off + 5];
w1[2] = buf[off + 6];
w1[3] = buf[off + 7];
w2[0] = buf[off + 8];
w2[1] = buf[off + 9];
w2[2] = buf[off + 10];
w2[3] = buf[off + 11];
w3[0] = buf[off + 12];
w3[1] = buf[off + 13];
w3[2] = len;
w3[3] = 0;
h[0] = ipad[0];
h[1] = ipad[1];
h[2] = ipad[2];
h[3] = ipad[3];
t (w0, w1, w2, w3, h);
w0[0] = h[0];
w0[1] = h[1];
w0[2] = h[2];
w0[3] = h[3];
h[0] = opad[0];
h[1] = opad[1];
h[2] = opad[2];
h[3] = opad[3];
t (w0, w0, w0, w0, h);
if (h[0] == 0) out[0] = 1;
}
}
// The main() function is just to demonstrate the issue is a system-wide issue in NVCC,
// and is not caused by any extra options used when calling NVCC.
// For easier debugging you can just compile into -cubin to trigger the same issue
int main () {}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment