Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active December 16, 2015 23:29
Show Gist options
  • Save allanmac/5514436 to your computer and use it in GitHub Desktop.
Save allanmac/5514436 to your computer and use it in GitHub Desktop.
What is the best way to extract up to 32 bits that straddle the 32-bit boundary of a 64-bit word given a constant starting position and number of bits? On sm_35 the SHF.R.CLAMP opcode can accomplish this in two instructions. For sm_12-sm_30 devices as many as four instructions are required.
#include <stdio.h>
//
//
//
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
//
//
//
#define S2V_B64(s,v) asm("mov.b64 {%0,%1}, %2;" : "=r"(v##.x), "=r"(v##.y) : "l"(s))
DEVICE_INTRINSIC_QUALIFIERS
unsigned int
bfe64(const unsigned long long src,
const unsigned int startBit,
const unsigned int numBits)
{
#if __CUDA_ARCH__ >= 350
unsigned int bits;
uint2 ab;
S2V_B64(src,ab);
asm("shf.r.clamp.b32 %0, %1, %2, %3;" :
"=r"(bits) : "r"(ab.x), "r"(ab.y), "r"(startBit));
return bits & ((1<<numBits)-1);
#elif __CUDA_ARCH__ >= 200
unsigned long long bits;
asm("bfe.u64 %0, %1, %2, %3;" :
"=l"(bits) : "l"(src), "r"(startBit), "r"(numBits));
return (unsigned int)bits;
#else
const unsigned int MASK = (1 << numBits) - 1;
return (src >> startBit) & MASK;
#endif
}
//
//
//
__global__
void
bfe64Kernel(const unsigned long long* const inB64, unsigned int* const outB32)
{
const unsigned long long ab = inB64[threadIdx.x];
const unsigned int d = bfe64(ab,25,10);
outB32[threadIdx.x] = d;
}
//
//
//
int main(int argc, char** argv)
{
const int device = (argc == 2) ? atoi(argv[1]) : 0;
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
cudaSetDevice(device);
unsigned long long* inB64;
unsigned int* outB32;
cudaMalloc(&inB64, sizeof(unsigned long long));
cudaMalloc(&outB32,sizeof(unsigned int));
#define VAL 0xFEEDFACEDEADBEEFL
const unsigned long long valB64[] = { VAL };
cudaMemcpy(inB64,valB64,sizeof(unsigned long long),cudaMemcpyHostToDevice);
//
//
//
bfe64Kernel<<<1,1>>>(inB64,outB32);
cudaDeviceSynchronize();
//
//
//
unsigned int val32[1];
cudaMemcpy(val32,outB32,sizeof(unsigned int),cudaMemcpyDeviceToHost);
printf("%16llX\n",VAL);
printf("%16X\n",val32[0]);
//
//
//
cudaFree(inB64);
cudaFree(outB32);
cudaDeviceReset();
return 0;
}
@allanmac
Copy link
Author

allanmac commented May 3, 2013

Compiled with nvcc -m 32 -arch sm_35 -Xptxas=-v,-abi=no -cubin bfe64.cu

sm_35:

    /*0028*/     /*0x0c9c0809c7c00c00*/     SHF.R R2, R2, 0x19, R3;
    /*0030*/     /*0xff9c0805c2000001*/     LOP.AND R1, R2, 0x3ff;

@allanmac
Copy link
Author

allanmac commented May 3, 2013

sm_20,sm_21,sm_30:

    /*0048*/     /*0x64205c035800c000*/     SHR.U32 R1, R2, 0x19;
    /*0050*/     /*0x04305ce340000000*/     ISCADD R1, R3, R1, 0x7;
    /*0058*/     /*0xfc00dde428000000*/     MOV R3, RZ;  <-- probably redundant due to bfe.u64
    /*0060*/     /*0xfc109c036800c00f*/     LOP.AND R2, R1, 0x3ff;

@allanmac
Copy link
Author

allanmac commented May 3, 2013

sm_12:

    /*0018*/     /*0x30190405e4100780*/     SHR R1, R2, 0x19;
    /*0020*/     /*0x30070609c4100780*/     SHL R2, R3, 0x7;
    /*0028*/     /*0xd002020504004780*/     LOP.OR R1, R1, R2;
    /*0050*/     /*0xd081020104400780*/     LOP.AND R0, R1, c [0x1] [0x1];

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment