Last active
December 16, 2015 23:29
-
-
Save allanmac/5514436 to your computer and use it in GitHub Desktop.
What is the best way to extract up to 32 bits that straddle the 32-bit boundary of a 64-bit word given a constant starting position and number of bits? On sm_35 the SHF.R.CLAMP opcode can accomplish this in two instructions. For sm_12-sm_30 devices as many as four instructions are required.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
// | |
// | |
// | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__ | |
// | |
// | |
// | |
#define S2V_B64(s,v) asm("mov.b64 {%0,%1}, %2;" : "=r"(v##.x), "=r"(v##.y) : "l"(s)) | |
DEVICE_INTRINSIC_QUALIFIERS | |
unsigned int | |
bfe64(const unsigned long long src, | |
const unsigned int startBit, | |
const unsigned int numBits) | |
{ | |
#if __CUDA_ARCH__ >= 350 | |
unsigned int bits; | |
uint2 ab; | |
S2V_B64(src,ab); | |
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : | |
"=r"(bits) : "r"(ab.x), "r"(ab.y), "r"(startBit)); | |
return bits & ((1<<numBits)-1); | |
#elif __CUDA_ARCH__ >= 200 | |
unsigned long long bits; | |
asm("bfe.u64 %0, %1, %2, %3;" : | |
"=l"(bits) : "l"(src), "r"(startBit), "r"(numBits)); | |
return (unsigned int)bits; | |
#else | |
const unsigned int MASK = (1 << numBits) - 1; | |
return (src >> startBit) & MASK; | |
#endif | |
} | |
// | |
// | |
// | |
__global__ | |
void | |
bfe64Kernel(const unsigned long long* const inB64, unsigned int* const outB32) | |
{ | |
const unsigned long long ab = inB64[threadIdx.x]; | |
const unsigned int d = bfe64(ab,25,10); | |
outB32[threadIdx.x] = d; | |
} | |
// | |
// | |
// | |
int main(int argc, char** argv) | |
{ | |
const int device = (argc == 2) ? atoi(argv[1]) : 0; | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props,device); | |
printf("%s (%2d)\n",props.name,props.multiProcessorCount); | |
cudaSetDevice(device); | |
unsigned long long* inB64; | |
unsigned int* outB32; | |
cudaMalloc(&inB64, sizeof(unsigned long long)); | |
cudaMalloc(&outB32,sizeof(unsigned int)); | |
#define VAL 0xFEEDFACEDEADBEEFL | |
const unsigned long long valB64[] = { VAL }; | |
cudaMemcpy(inB64,valB64,sizeof(unsigned long long),cudaMemcpyHostToDevice); | |
// | |
// | |
// | |
bfe64Kernel<<<1,1>>>(inB64,outB32); | |
cudaDeviceSynchronize(); | |
// | |
// | |
// | |
unsigned int val32[1]; | |
cudaMemcpy(val32,outB32,sizeof(unsigned int),cudaMemcpyDeviceToHost); | |
printf("%16llX\n",VAL); | |
printf("%16X\n",val32[0]); | |
// | |
// | |
// | |
cudaFree(inB64); | |
cudaFree(outB32); | |
cudaDeviceReset(); | |
return 0; | |
} |
sm_20,sm_21,sm_30:
/*0048*/ /*0x64205c035800c000*/ SHR.U32 R1, R2, 0x19;
/*0050*/ /*0x04305ce340000000*/ ISCADD R1, R3, R1, 0x7;
/*0058*/ /*0xfc00dde428000000*/ MOV R3, RZ; <-- probably redundant due to bfe.u64
/*0060*/ /*0xfc109c036800c00f*/ LOP.AND R2, R1, 0x3ff;
sm_12:
/*0018*/ /*0x30190405e4100780*/ SHR R1, R2, 0x19;
/*0020*/ /*0x30070609c4100780*/ SHL R2, R3, 0x7;
/*0028*/ /*0xd002020504004780*/ LOP.OR R1, R1, R2;
/*0050*/ /*0xd081020104400780*/ LOP.AND R0, R1, c [0x1] [0x1];
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compiled with
nvcc -m 32 -arch sm_35 -Xptxas=-v,-abi=no -cubin bfe64.cu
sm_35: