Last active
September 21, 2023 22:59
-
-
Save allanmac/5166783 to your computer and use it in GitHub Desktop.
Experiments with shfl.idx/up/down to see how negative indices or offsets are handled. The shuffled value and its predicate are returned.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
#define KERNEL_QUALIFIERS extern "C" __global__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__ | |
// | |
// | |
// | |
DEVICE_INTRINSIC_QUALIFIERS | |
unsigned int laneId() | |
{ | |
unsigned int id; | |
asm("mov.u32 %0, %%laneid;" : "=r"(id)); | |
return id; | |
} | |
DEVICE_INTRINSIC_QUALIFIERS | |
unsigned int | |
shfl_rot(const unsigned int v, const int rot, bool* const flag) | |
{ | |
const unsigned int mask = 0 | 0x1F; | |
unsigned int r; | |
unsigned int f; | |
asm(".reg .pred p;" | |
"shfl.idx.b32 %0|p, %2, %3, %4;" | |
"selp.u32 %1,1,0,p;" | |
: "=r"(r), "=r"(f) : "r"(v), "r"(laneId()-rot), "r"(mask)); | |
*flag = f; | |
return r; | |
} | |
DEVICE_INTRINSIC_QUALIFIERS | |
unsigned int | |
shfl_ror(const unsigned int v, const unsigned int ror, bool* const flag) | |
{ | |
const unsigned int mask = 0 | 0; | |
unsigned int r; | |
unsigned int f; | |
asm(".reg .pred p;" | |
"shfl.up.b32 %0|p, %2, %3, %4;" | |
"selp.u32 %1,1,0,p;" | |
: "=r"(r), "=r"(f) : "r"(v), "r"(ror), "r"(mask)); | |
*flag = f; | |
return r; | |
} | |
DEVICE_INTRINSIC_QUALIFIERS | |
unsigned int | |
shfl_rol(const unsigned int v, const unsigned int rol, bool* const flag) | |
{ | |
const unsigned int mask = 0 | 0x1F; | |
unsigned int r; | |
unsigned int f; | |
asm(".reg .pred p;" | |
"shfl.down.b32 %0|p, %2, %3, %4;" | |
"selp.u32 %1,1,0,p;" | |
: "=r"(r), "=r"(f) : "r"(v), "r"(rol), "r"(mask)); | |
*flag = f; | |
return r; | |
} | |
// | |
// | |
// | |
KERNEL_QUALIFIERS | |
void shflRotKernel(const unsigned int* const vin, | |
unsigned int* const vout, | |
bool* const pout, | |
const int rot) | |
{ | |
unsigned int v = vin[threadIdx.x]; | |
v = shfl_rot(v,rot,pout+threadIdx.x); | |
vout[threadIdx.x] = v; | |
} | |
KERNEL_QUALIFIERS | |
void shflRorKernel(const unsigned int* const vin, | |
unsigned int* const vout, | |
bool* const pout, | |
const int rot) | |
{ | |
unsigned int v = vin[threadIdx.x]; | |
v = shfl_ror(v,rot,pout+threadIdx.x); | |
vout[threadIdx.x] = v; | |
} | |
KERNEL_QUALIFIERS | |
void shflRolKernel(const unsigned int* const vin, | |
unsigned int* const vout, | |
bool* const pout, | |
const int rot) | |
{ | |
unsigned int v = vin[threadIdx.x]; | |
v = shfl_rol(v,rot,pout+threadIdx.x); | |
vout[threadIdx.x] = v; | |
} | |
// | |
// | |
// | |
void printRing(const unsigned int* const ring, const bool* const pred) | |
{ | |
for (int ii=0; ii<WARP_SIZE; ii++) | |
printf("%2d ",ring[ii]); | |
printf("\n"); | |
for (int ii=0; ii<WARP_SIZE; ii++) | |
printf("%s ",pred[ii] ? " ." : " x"); | |
printf("\n"); | |
} | |
// | |
// | |
// | |
int main(int argc, char** argv) | |
{ | |
int rot = (argc == 1) ? 5 : atoi(argv[1]); | |
int device = (argc == 3) ? atoi(argv[2]) : 0; | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props,device); | |
printf("%s (%2d)\n",props.name,props.multiProcessorCount); | |
if (props.major < 3) // need SHFL | |
{ | |
printf("requires sm_30\n"); | |
return -1; | |
} | |
cudaSetDevice(device); | |
// | |
// LAUNCH KERNEL | |
// | |
unsigned int* vin; | |
unsigned int* vout; | |
bool* pout; | |
const unsigned int ring[WARP_SIZE] = | |
{ | |
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | |
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 | |
}; | |
const bool pred[WARP_SIZE] = | |
{ | |
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false | |
}; | |
cudaMalloc(&vin, sizeof(ring)); | |
cudaMalloc(&vout,sizeof(ring)); | |
cudaMalloc(&pout,sizeof(pred)); | |
cudaMemcpy(vin, ring,sizeof(ring),cudaMemcpyHostToDevice); | |
cudaMemcpy(pout,pred,sizeof(pred),cudaMemcpyHostToDevice); | |
// | |
// | |
// | |
unsigned int ringRot[WARP_SIZE]; | |
bool predRot[WARP_SIZE]; | |
// | |
// | |
// | |
shflRotKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot); | |
cudaDeviceSynchronize(); | |
cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost); | |
cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost); | |
printf("\nshfl.idx (rot): %d\n",rot); | |
printRing(ringRot,predRot); | |
// | |
// | |
// | |
shflRorKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot); | |
cudaDeviceSynchronize(); | |
cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost); | |
cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost); | |
printf("\nshfl.up (ror): %d\n",rot); | |
printRing(ringRot,predRot); | |
// | |
// | |
// | |
shflRolKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot); | |
cudaDeviceSynchronize(); | |
cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost); | |
cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost); | |
printf("\nshfl.down (rol): %d\n",rot); | |
printRing(ringRot,predRot); | |
// | |
// | |
// | |
cudaFree(vin); | |
cudaFree(vout); | |
cudaFree(pout); | |
cudaDeviceReset(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compiled with:
nvcc -m 32 -arch sm_30 shflrot.cu
As you can see, negative offsets are handled by
shfl.idx
but are masked into unsigned values byshfl.up
andshfl.down
.