allanmac/bfe64.cu

## bfe64.cu
#include <stdio.h>
#include <stdlib.h>

//
//
//

// Qualifier set placed in front of the device helper below: the function is
// compiled for the device only (__device__) and is force-inlined at every
// call site (__forceinline__).
#define DEVICE_INTRINSIC_QUALIFIERS   __device__ __forceinline__

//
//
//

// Split the 64-bit scalar `s` into the two 32-bit halves of the uint2 `v`:
// v.x receives bits 31:0 and v.y receives bits 63:32.
// The members are reached with (v).x / (v).y rather than by token pasting:
// pasting an identifier with '.' does not form a valid preprocessing token and
// is rejected by GCC-based host toolchains.
#define S2V_B64(s,v)   asm("mov.b64 {%0,%1}, %2;" : "=r"((v).x), "=r"((v).y) : "l"(s))

// Extract an unsigned bit field from a 64-bit value.
//
//   src      - 64-bit source word
//   startBit - index of the least-significant bit of the field (0 = LSB of src)
//   numBits  - width of the field in bits
//
// Returns the field right-aligned in a 32-bit word. Because the result is only
// 32 bits wide, a width of 32 or more yields the low 32 bits of
// (src >> startBit). Bits at or beyond position 64 read as zero, so
// startBit >= 64 returns 0. All three architecture paths below follow these
// same rules.
DEVICE_INTRINSIC_QUALIFIERS
unsigned int
bfe64(const unsigned long long src,
      const unsigned int       startBit,
      const unsigned int       numBits)
{
  // Nothing exists above bit 63; this also keeps every shift below in range.
  if (startBit >= 64)
    return 0;

  // 1u << 32 is undefined, so widths of 32 or more select the whole result.
  const unsigned int mask = (numBits >= 32) ? 0xFFFFFFFFu : ((1u << numBits) - 1u);

#if __CUDA_ARCH__ >= 350

  unsigned int bits;
  uint2        ab;

  S2V_B64(src,ab);

  // The funnel shift only moves by 0..31 bits. For startBit >= 32 the window
  // is slid up by one word first (high word becomes the low input, zero is
  // shifted in from above); wrap mode then applies the remaining startBit & 31.
  const unsigned int lo = (startBit < 32) ? ab.x : ab.y;
  const unsigned int hi = (startBit < 32) ? ab.y : 0u;

  asm("shf.r.wrap.b32 %0, %1, %2, %3;" :
      "=r"(bits) : "r"(lo), "r"(hi), "r"(startBit));
  return bits & mask;

#elif __CUDA_ARCH__ >= 200

  // bfe only looks at the low 8 bits of the length operand, so clamp it to the
  // source width to keep very large widths from wrapping around.
  const unsigned int len = (numBits > 64) ? 64u : numBits;

  unsigned long long bits;
  asm("bfe.u64 %0, %1, %2, %3;" :
      "=l"(bits) : "l"(src), "r"(startBit), "r"(len));
  return (unsigned int)bits;

#else

  // Portable path (also what the host compilation pass sees).
  return (unsigned int)(src >> startBit) & mask;

#endif
}

//
//
//
// Demo kernel: each thread loads the 64-bit word at index threadIdx.x from
// inB64, extracts the 10-bit field starting at bit 25 with bfe64(), and writes
// the result to the same index of outB32. Only threadIdx.x is used for
// indexing and there is no bounds check, so both arrays must hold at least
// blockDim.x elements.
__global__
void
bfe64Kernel(const unsigned long long* const inB64, unsigned int* const outB32)
{
  const unsigned int slot = threadIdx.x;

  outB32[slot] = bfe64(inB64[slot],25,10);
}

//
//
//

// Evaluate a CUDA runtime call and, if it did not return cudaSuccess, report
// the failing location and error string on stderr and terminate the program.
#define CUDA_CHECK(call)                                                    \
  do {                                                                      \
    const cudaError_t cudaCheckErr_ = (call);                               \
    if (cudaCheckErr_ != cudaSuccess) {                                     \
      fprintf(stderr,"CUDA error %s:%d: %s\n",__FILE__,__LINE__,            \
              cudaGetErrorString(cudaCheckErr_));                           \
      exit(EXIT_FAILURE);                                                   \
    }                                                                       \
  } while (0)

// Host driver: runs bfe64Kernel on a single 64-bit test value and prints the
// test value followed by the 32-bit result returned by the kernel.
//
// An optional single command-line argument selects the CUDA device index
// (default 0). Every runtime call is checked; any failure aborts with a
// message instead of printing an undefined result.
int main(int argc, char** argv)
{
  const int device = (argc == 2) ? atoi(argv[1]) : 0;

  // Also validates the device index: an invalid index fails here.
  cudaDeviceProp props;
  CUDA_CHECK(cudaGetDeviceProperties(&props,device));

  printf("%s (%2d)\n",props.name,props.multiProcessorCount);

  CUDA_CHECK(cudaSetDevice(device));

  unsigned long long* inB64  = NULL;
  unsigned int*       outB32 = NULL;

  CUDA_CHECK(cudaMalloc(&inB64, sizeof(unsigned long long)));
  CUDA_CHECK(cudaMalloc(&outB32,sizeof(unsigned int)));

  // ULL keeps the literal's type in step with the %llX conversion used below.
#define VAL 0xFEEDFACEDEADBEEFULL

  const unsigned long long valB64[] = { VAL };

  CUDA_CHECK(cudaMemcpy(inB64,valB64,sizeof(unsigned long long),cudaMemcpyHostToDevice));

  //
  // one block of one thread: exactly one input word and one output word
  //

  bfe64Kernel<<<1,1>>>(inB64,outB32);

  CUDA_CHECK(cudaGetLastError());       // launch-time errors
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

  //
  // fetch and print the result
  //

  unsigned int val32[1] = { 0 };

  CUDA_CHECK(cudaMemcpy(val32,outB32,sizeof(unsigned int),cudaMemcpyDeviceToHost));

  printf("%16llX\n",VAL);
  printf("%16X\n",val32[0]);

  //
  // cleanup
  //

  CUDA_CHECK(cudaFree(inB64));
  CUDA_CHECK(cudaFree(outB32));

  CUDA_CHECK(cudaDeviceReset());

  return 0;
}
	#include <stdio.h>

	//
	//
	//

	// Qualifier set placed in front of the device helper below: the function is
	// compiled for the device only (__device__) and is force-inlined at every
	// call site (__forceinline__).
	#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__

	//
	//
	//

	// Split the 64-bit scalar `s` into the two 32-bit halves of the uint2 `v`:
	// v.x receives bits 31:0 and v.y receives bits 63:32.
	// The members are reached with (v).x / (v).y rather than by token pasting:
	// pasting an identifier with '.' does not form a valid preprocessing token and
	// is rejected by GCC-based host toolchains.
	#define S2V_B64(s,v) asm("mov.b64 {%0,%1}, %2;" : "=r"((v).x), "=r"((v).y) : "l"(s))

	// Extract an unsigned bit field from a 64-bit value.
	//
	//   src      - 64-bit source word
	//   startBit - index of the least-significant bit of the field (0 = LSB of src)
	//   numBits  - width of the field in bits
	//
	// Returns the field right-aligned in a 32-bit word. Because the result is only
	// 32 bits wide, a width of 32 or more yields the low 32 bits of
	// (src >> startBit). Bits at or beyond position 64 read as zero, so
	// startBit >= 64 returns 0. All three architecture paths below follow these
	// same rules.
	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int
	bfe64(const unsigned long long src,
	const unsigned int startBit,
	const unsigned int numBits)
	{
	// Nothing exists above bit 63; this also keeps every shift below in range.
	if (startBit >= 64)
	  return 0;

	// 1u << 32 is undefined, so widths of 32 or more select the whole result.
	const unsigned int mask = (numBits >= 32) ? 0xFFFFFFFFu : ((1u << numBits) - 1u);

	#if __CUDA_ARCH__ >= 350

	unsigned int bits;
	uint2 ab;

	S2V_B64(src,ab);

	// The funnel shift only moves by 0..31 bits. For startBit >= 32 the window
	// is slid up by one word first (high word becomes the low input, zero is
	// shifted in from above); wrap mode then applies the remaining startBit & 31.
	const unsigned int lo = (startBit < 32) ? ab.x : ab.y;
	const unsigned int hi = (startBit < 32) ? ab.y : 0u;

	asm("shf.r.wrap.b32 %0, %1, %2, %3;" :
	"=r"(bits) : "r"(lo), "r"(hi), "r"(startBit));
	return bits & mask;

	#elif __CUDA_ARCH__ >= 200

	// bfe only looks at the low 8 bits of the length operand, so clamp it to the
	// source width to keep very large widths from wrapping around.
	const unsigned int len = (numBits > 64) ? 64u : numBits;

	unsigned long long bits;
	asm("bfe.u64 %0, %1, %2, %3;" :
	"=l"(bits) : "l"(src), "r"(startBit), "r"(len));
	return (unsigned int)bits;

	#else

	// Portable path (also what the host compilation pass sees).
	return (unsigned int)(src >> startBit) & mask;

	#endif
	}

	//
	//
	//
	// Demo kernel: each thread loads the 64-bit word at index threadIdx.x from
	// inB64, extracts the 10-bit field starting at bit 25 with bfe64(), and writes
	// the result to the same index of outB32. Only threadIdx.x is used for
	// indexing and there is no bounds check, so both arrays must hold at least
	// blockDim.x elements.
	__global__
	void
	bfe64Kernel(const unsigned long long* const inB64, unsigned int* const outB32)
	{
	const unsigned int slot = threadIdx.x;

	outB32[slot] = bfe64(inB64[slot],25,10);
	}

	//
	//
	//

	// Evaluate a CUDA runtime call and, if it did not return cudaSuccess, report
	// the failing location and error string on stderr and terminate the program.
	#define CUDA_CHECK(call)                                                    \
	  do {                                                                      \
	    const cudaError_t cudaCheckErr_ = (call);                               \
	    if (cudaCheckErr_ != cudaSuccess) {                                     \
	      fprintf(stderr,"CUDA error %s:%d: %s\n",__FILE__,__LINE__,            \
	              cudaGetErrorString(cudaCheckErr_));                           \
	      exit(EXIT_FAILURE);                                                   \
	    }                                                                       \
	  } while (0)

	// Host driver: runs bfe64Kernel on a single 64-bit test value and prints the
	// test value followed by the 32-bit result returned by the kernel.
	//
	// An optional single command-line argument selects the CUDA device index
	// (default 0). Every runtime call is checked; any failure aborts with a
	// message instead of printing an undefined result.
	int main(int argc, char** argv)
	{
	const int device = (argc == 2) ? atoi(argv[1]) : 0;

	// Also validates the device index: an invalid index fails here.
	cudaDeviceProp props;
	CUDA_CHECK(cudaGetDeviceProperties(&props,device));

	printf("%s (%2d)\n",props.name,props.multiProcessorCount);

	CUDA_CHECK(cudaSetDevice(device));

	unsigned long long* inB64 = NULL;
	unsigned int* outB32 = NULL;

	CUDA_CHECK(cudaMalloc(&inB64, sizeof(unsigned long long)));
	CUDA_CHECK(cudaMalloc(&outB32,sizeof(unsigned int)));

	// ULL keeps the literal's type in step with the %llX conversion used below.
	#define VAL 0xFEEDFACEDEADBEEFULL

	const unsigned long long valB64[] = { VAL };

	CUDA_CHECK(cudaMemcpy(inB64,valB64,sizeof(unsigned long long),cudaMemcpyHostToDevice));

	//
	// one block of one thread: exactly one input word and one output word
	//

	bfe64Kernel<<<1,1>>>(inB64,outB32);

	CUDA_CHECK(cudaGetLastError());       // launch-time errors
	CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

	//
	// fetch and print the result
	//

	unsigned int val32[1] = { 0 };

	CUDA_CHECK(cudaMemcpy(val32,outB32,sizeof(unsigned int),cudaMemcpyDeviceToHost));

	printf("%16llX\n",VAL);
	printf("%16X\n",val32[0]);

	//
	// cleanup
	//

	CUDA_CHECK(cudaFree(inB64));
	CUDA_CHECK(cudaFree(outB32));

	CUDA_CHECK(cudaDeviceReset());

	return 0;
	}