Skip to content

Instantly share code, notes, and snippets.

@snakers4
Last active January 28, 2020 10:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save snakers4/f5dda917ef0f46bdbf43ef3916def86e to your computer and use it in GitHub Desktop.
Save snakers4/f5dda917ef0f46bdbf43ef3916def86e to your computer and use it in GitHub Desktop.
warp_ctc_patch_cuda_10_1
diff --git a/include/contrib/moderngpu/include/device/intrinsics.cuh b/include/contrib/moderngpu/include/device/intrinsics.cuh
index a601443..c212a38 100644
--- a/include/contrib/moderngpu/include/device/intrinsics.cuh
+++ b/include/contrib/moderngpu/include/device/intrinsics.cuh
@@ -112,7 +112,7 @@ __device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
- var = __shfl_up(var, delta, width);
+ var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#endif
return var;
}
@@ -122,8 +122,8 @@ __device__ __forceinline__ double shfl_up(double var,
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
- p.x = __shfl_up(p.x, delta, width);
- p.y = __shfl_up(p.y, delta, width);
+ p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
+ p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
var = mgpu::int2_as_double(p);
#endif
@@ -140,7 +140,7 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
asm(
"{.reg .s32 r0;"
".reg .pred p;"
- "shfl.up.b32 r0|p, %1, %2, %3;"
+ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
@@ -155,7 +155,7 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
asm(
"{.reg .s32 r0;"
".reg .pred p;"
- "shfl.up.b32 r0|p, %1, %2, %3;"
+ "shfl.up.sync..b32 r0|p, %1, %2, %3, %4;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
diff --git a/src/reduce.cu b/src/reduce.cu
index df7b3af..7a428f8 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -41,7 +41,7 @@ struct CTAReduce {
T shuff;
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
- shuff = __shfl_down(x, offset);
+ shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
if (tid + offset < count && tid < offset)
x = g(x, shuff);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment