Created
February 6, 2020 10:17
-
-
Save jsteube/2e89e43bda98db61291bd07ed143cd55 to your computer and use it in GitHub Desktop.
NVCC issue POC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// I made this source code to demonstrate an issue which affects the compile process of CUDA kernels. | |
// | |
// - All GPU architectures are affected | |
// - Tested on CUDA SDK 10.x and 9.x | |
// - Tested on Ubuntu 18.04 LTS and Windows 10 | |
__device__ void s (int *in, int *out); | |
__device__ void t (int *w0, int *w1, int *w2, int *w3, int *h); | |
// I created three artificial kernels. They all consist of 100% the same code. | |
// The code is just one function call. All three kernels call the same function using the same parameters. | |
__global__ void x1 (int *in, int *out) | |
{ | |
s (in, out); | |
} | |
__global__ void x2 (int *in, int *out) | |
{ | |
s (in, out); | |
} | |
__global__ void x3 (int *in, int *out) | |
{ | |
s (in, out); | |
} | |
// Two of three kernels will end up in the same bytecode as expected. | |
// The third will create a different bytecode. This one will run at a reduced performance. | |
// This is not a theoretical problem. | |
// My real-world application suffers from this issue and runs at 15% reduced performance. | |
// | |
// Steps to reproduce: | |
// | |
// $ nvcc -arch=sm_75 -Xptxas="-v" 3k.cu 2>&1 | grep Used | |
// ptxas info : Used 70 registers, 368 bytes cmem[0] | |
// ptxas info : Used 53 registers, 368 bytes cmem[0] | |
// ptxas info : Used 53 registers, 368 bytes cmem[0] | |
// | |
// Alternative: | |
// | |
// $ nvcc -cubin -arch=sm_75 3k.cu | |
// $ cuobjdump --dump-resource-usage 3k.cubin | grep REG: | |
// REG:70 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:368 TEXTURE:0 SURFACE:0 SAMPLER:0 | |
// REG:53 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:368 TEXTURE:0 SURFACE:0 SAMPLER:0 | |
// REG:53 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:368 TEXTURE:0 SURFACE:0 SAMPLER:0 | |
// | |
// The functions s() and t() are relevant to trigger the issue. | |
// Please do not try to see any sense in the code in the functions. | |
// They do not make sense anymore after I stripped them down as much as possible to simplify debugging for you. | |
__device__ void t (int *w0, int *w1, int *w2, int *w3, int *h) | |
{ | |
#define STEP(a,b,x,K) \ | |
{ \ | |
a += K; \ | |
a += x; \ | |
a += b; \ | |
} | |
int a = h[0]; | |
int b = h[1]; | |
int c = h[2]; | |
int d = h[3]; | |
STEP (a, b, w0[0], 0xd76aa478); | |
STEP (d, a, w0[1], 0xe8c7b756); | |
STEP (c, d, w0[2], 0x242070db); | |
STEP (b, c, w0[3], 0xc1bdceee); | |
STEP (a, b, w1[0], 0xf57c0faf); | |
STEP (d, a, w1[1], 0x4787c62a); | |
STEP (c, d, w1[2], 0xa8304613); | |
STEP (b, c, w1[3], 0xfd469501); | |
STEP (a, b, w2[0], 0x698098d8); | |
STEP (d, a, w2[1], 0x8b44f7af); | |
STEP (c, d, w2[2], 0xffff5bb1); | |
STEP (b, c, w2[3], 0x895cd7be); | |
STEP (a, b, w3[0], 0x6b901122); | |
STEP (d, a, w3[1], 0xfd987193); | |
STEP (c, d, w3[2], 0xa679438e); | |
STEP (b, c, w3[3], 0x49b40821); | |
STEP (a, b, w0[1], 0xd76aa478); | |
STEP (d, a, w1[2], 0xe8c7b756); | |
STEP (c, d, w2[3], 0x242070db); | |
STEP (b, c, w0[0], 0xc1bdceee); | |
STEP (a, b, w1[1], 0xf57c0faf); | |
STEP (d, a, w2[2], 0x4787c62a); | |
STEP (c, d, w3[3], 0xa8304613); | |
STEP (b, c, w1[0], 0xfd469501); | |
STEP (a, b, w2[1], 0x698098d8); | |
STEP (d, a, w3[2], 0x8b44f7af); | |
STEP (c, d, w0[3], 0xffff5bb1); | |
STEP (b, c, w2[0], 0x895cd7be); | |
STEP (a, b, w3[1], 0x6b901122); | |
STEP (d, a, w0[2], 0xfd987193); | |
STEP (c, d, w1[3], 0xa679438e); | |
STEP (b, c, w3[0], 0x49b40821); | |
STEP (a, b, w1[1], 0xd76aa478); | |
STEP (d, a, w2[0], 0xe8c7b756); | |
STEP (c, d, w2[3], 0x242070db); | |
STEP (b, c, w3[2], 0xc1bdceee); | |
STEP (a, b, w0[1], 0xf57c0faf); | |
STEP (d, a, w1[0], 0x4787c62a); | |
STEP (c, d, w1[3], 0xa8304613); | |
STEP (b, c, w2[2], 0xfd469501); | |
STEP (a, b, w3[1], 0x698098d8); | |
STEP (d, a, w0[0], 0x8b44f7af); | |
STEP (c, d, w0[3], 0xffff5bb1); | |
STEP (b, c, w1[2], 0x895cd7be); | |
STEP (a, b, w2[1], 0x6b901122); | |
STEP (d, a, w3[0], 0xfd987193); | |
STEP (c, d, w3[3], 0xa679438e); | |
STEP (b, c, w0[2], 0x49b40821); | |
STEP (a, b, w0[0], 0xd76aa478); | |
STEP (d, a, w1[3], 0xe8c7b756); | |
STEP (c, d, w3[2], 0x242070db); | |
STEP (b, c, w1[1], 0xc1bdceee); | |
STEP (a, b, w3[0], 0xf57c0faf); | |
STEP (d, a, w0[3], 0x4787c62a); | |
STEP (c, d, w2[2], 0xa8304613); | |
STEP (b, c, w0[1], 0xfd469501); | |
STEP (a, b, w2[0], 0x698098d8); | |
STEP (d, a, w3[3], 0x8b44f7af); | |
STEP (c, d, w1[2], 0xffff5bb1); | |
STEP (b, c, w3[1], 0x895cd7be); | |
STEP (a, b, w1[0], 0x6b901122); | |
STEP (d, a, w2[3], 0xfd987193); | |
STEP (c, d, w0[2], 0xa679438e); | |
STEP (b, c, w2[1], 0x49b40821); | |
h[0] += a; | |
h[1] += b; | |
h[2] += c; | |
h[3] += d; | |
} | |
__device__ void s (int *in, int *out) | |
{ | |
int len = in[0]; | |
int *buf = in; | |
for (int i = 0; i < 2; i++) | |
{ | |
int w0[4] = { i }; | |
int w1[4] = { 0 }; | |
int w2[4] = { 0 }; | |
int w3[4] = { 0 }; | |
int ipad[4] = { 0 }; | |
int opad[4] = { 0 }; | |
t (w0, w0, w0, w0, ipad); | |
t (w0, w0, w0, w0, opad); | |
w0[0] = buf[0]; | |
w0[1] = buf[0]; | |
w0[2] = buf[0]; | |
w0[3] = len; | |
int h[4]; | |
t (w0, w0, w0, w0, h); | |
w0[0] = h[0]; | |
w0[1] = h[1]; | |
w0[2] = h[2]; | |
w0[3] = h[3]; | |
ipad[0] = 1; | |
ipad[1] = 1; | |
ipad[2] = 1; | |
ipad[3] = 1; | |
opad[0] = 1; | |
opad[1] = 1; | |
opad[2] = 1; | |
opad[3] = 1; | |
t (w0, w0, w0, w0, ipad); | |
t (w0, w0, w0, w0, opad); | |
int left; | |
int off; | |
for (left = len, off = 0; left >= 56; left -= 64, off += 16) | |
{ | |
w0[0] = buf[off + 0]; | |
w0[1] = buf[off + 1]; | |
w0[2] = buf[off + 2]; | |
w0[3] = buf[off + 3]; | |
w1[0] = buf[off + 4]; | |
w1[1] = buf[off + 5]; | |
w1[2] = buf[off + 6]; | |
w1[3] = buf[off + 7]; | |
w2[0] = buf[off + 8]; | |
w2[1] = buf[off + 9]; | |
w2[2] = buf[off + 10]; | |
w2[3] = buf[off + 11]; | |
w3[0] = buf[off + 12]; | |
w3[1] = buf[off + 13]; | |
w3[2] = buf[off + 14]; | |
w3[3] = buf[off + 15]; | |
t (w0, w1, w2, w3, ipad); | |
} | |
w0[0] = buf[off + 0]; | |
w0[1] = buf[off + 1]; | |
w0[2] = buf[off + 2]; | |
w0[3] = buf[off + 3]; | |
w1[0] = buf[off + 4]; | |
w1[1] = buf[off + 5]; | |
w1[2] = buf[off + 6]; | |
w1[3] = buf[off + 7]; | |
w2[0] = buf[off + 8]; | |
w2[1] = buf[off + 9]; | |
w2[2] = buf[off + 10]; | |
w2[3] = buf[off + 11]; | |
w3[0] = buf[off + 12]; | |
w3[1] = buf[off + 13]; | |
w3[2] = len; | |
w3[3] = 0; | |
h[0] = ipad[0]; | |
h[1] = ipad[1]; | |
h[2] = ipad[2]; | |
h[3] = ipad[3]; | |
t (w0, w1, w2, w3, h); | |
w0[0] = h[0]; | |
w0[1] = h[1]; | |
w0[2] = h[2]; | |
w0[3] = h[3]; | |
h[0] = opad[0]; | |
h[1] = opad[1]; | |
h[2] = opad[2]; | |
h[3] = opad[3]; | |
t (w0, w0, w0, w0, h); | |
if (h[0] == 0) out[0] = 1; | |
} | |
} | |
// The main() function is just to demonstrate the issue is a system-wide issue in NVCC, | |
// and is not caused by any extra options used when calling NVCC. | |
// For easier debugging you can just compile into -cubin to trigger the same issue | |
int main () {} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment