Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active September 21, 2023 22:59
Show Gist options
  • Save allanmac/5166783 to your computer and use it in GitHub Desktop.
Save allanmac/5166783 to your computer and use it in GitHub Desktop.
Experiments with shfl.idx/up/down to see how negative indices or offsets are handled. The shuffled value and its predicate are returned.
#include <stdio.h>
//
//
//
#define WARP_SIZE 32
#define KERNEL_QUALIFIERS extern "C" __global__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
//
//
//
DEVICE_INTRINSIC_QUALIFIERS
unsigned int laneId()
{
unsigned int id;
asm("mov.u32 %0, %%laneid;" : "=r"(id));
return id;
}
DEVICE_INTRINSIC_QUALIFIERS
unsigned int
shfl_rot(const unsigned int v, const int rot, bool* const flag)
{
const unsigned int mask = 0 | 0x1F;
unsigned int r;
unsigned int f;
asm(".reg .pred p;"
"shfl.idx.b32 %0|p, %2, %3, %4;"
"selp.u32 %1,1,0,p;"
: "=r"(r), "=r"(f) : "r"(v), "r"(laneId()-rot), "r"(mask));
*flag = f;
return r;
}
DEVICE_INTRINSIC_QUALIFIERS
unsigned int
shfl_ror(const unsigned int v, const unsigned int ror, bool* const flag)
{
const unsigned int mask = 0 | 0;
unsigned int r;
unsigned int f;
asm(".reg .pred p;"
"shfl.up.b32 %0|p, %2, %3, %4;"
"selp.u32 %1,1,0,p;"
: "=r"(r), "=r"(f) : "r"(v), "r"(ror), "r"(mask));
*flag = f;
return r;
}
DEVICE_INTRINSIC_QUALIFIERS
unsigned int
shfl_rol(const unsigned int v, const unsigned int rol, bool* const flag)
{
const unsigned int mask = 0 | 0x1F;
unsigned int r;
unsigned int f;
asm(".reg .pred p;"
"shfl.down.b32 %0|p, %2, %3, %4;"
"selp.u32 %1,1,0,p;"
: "=r"(r), "=r"(f) : "r"(v), "r"(rol), "r"(mask));
*flag = f;
return r;
}
//
//
//
KERNEL_QUALIFIERS
void shflRotKernel(const unsigned int* const vin,
unsigned int* const vout,
bool* const pout,
const int rot)
{
unsigned int v = vin[threadIdx.x];
v = shfl_rot(v,rot,pout+threadIdx.x);
vout[threadIdx.x] = v;
}
KERNEL_QUALIFIERS
void shflRorKernel(const unsigned int* const vin,
unsigned int* const vout,
bool* const pout,
const int rot)
{
unsigned int v = vin[threadIdx.x];
v = shfl_ror(v,rot,pout+threadIdx.x);
vout[threadIdx.x] = v;
}
KERNEL_QUALIFIERS
void shflRolKernel(const unsigned int* const vin,
unsigned int* const vout,
bool* const pout,
const int rot)
{
unsigned int v = vin[threadIdx.x];
v = shfl_rol(v,rot,pout+threadIdx.x);
vout[threadIdx.x] = v;
}
//
//
//
void printRing(const unsigned int* const ring, const bool* const pred)
{
for (int ii=0; ii<WARP_SIZE; ii++)
printf("%2d ",ring[ii]);
printf("\n");
for (int ii=0; ii<WARP_SIZE; ii++)
printf("%s ",pred[ii] ? " ." : " x");
printf("\n");
}
//
//
//
int main(int argc, char** argv)
{
int rot = (argc == 1) ? 5 : atoi(argv[1]);
int device = (argc == 3) ? atoi(argv[2]) : 0;
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
if (props.major < 3) // need SHFL
{
printf("requires sm_30\n");
return -1;
}
cudaSetDevice(device);
//
// LAUNCH KERNEL
//
unsigned int* vin;
unsigned int* vout;
bool* pout;
const unsigned int ring[WARP_SIZE] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
};
const bool pred[WARP_SIZE] =
{
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false
};
cudaMalloc(&vin, sizeof(ring));
cudaMalloc(&vout,sizeof(ring));
cudaMalloc(&pout,sizeof(pred));
cudaMemcpy(vin, ring,sizeof(ring),cudaMemcpyHostToDevice);
cudaMemcpy(pout,pred,sizeof(pred),cudaMemcpyHostToDevice);
//
//
//
unsigned int ringRot[WARP_SIZE];
bool predRot[WARP_SIZE];
//
//
//
shflRotKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);
cudaDeviceSynchronize();
cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);
printf("\nshfl.idx (rot): %d\n",rot);
printRing(ringRot,predRot);
//
//
//
shflRorKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);
cudaDeviceSynchronize();
cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);
printf("\nshfl.up (ror): %d\n",rot);
printRing(ringRot,predRot);
//
//
//
shflRolKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);
cudaDeviceSynchronize();
cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);
printf("\nshfl.down (rol): %d\n",rot);
printRing(ringRot,predRot);
//
//
//
cudaFree(vin);
cudaFree(vout);
cudaFree(pout);
cudaDeviceReset();
return 0;
}
@allanmac
Copy link
Author

Compiled with: nvcc -m 32 -arch sm_30 shflrot.cu

As you can see, negative offsets are handled by shfl.idx but are masked into unsigned values by shfl.up and shfl.down.

----------------------------------------------------------------------------------------------- 

>>a 5
Tesla K20c (13)

shfl.idx  (rot): 5
27 28 29 30 31  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
 .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 

shfl.up   (ror): 5
 0  1  2  3  4  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
 x  x  x  x  x  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 

shfl.down (rol): 5
 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 27 28 29 30 31 
 .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  x  x  x  x  x 

----------------------------------------------------------------------------------------------- 

>>a -5
Tesla K20c (13)

shfl.idx  (rot): -5
 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31  0  1  2  3  4 
 .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  . 

shfl.up   (ror): -5
 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26  0  1  2  3  4 
 x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  .  .  .  .  . 

shfl.down (rol): -5
27 28 29 30 31  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 
 .  .  .  .  .  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x 

-----------------------------------------------------------------------------------------------  

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment