Last active
August 29, 2015 14:12
Acquire the indices of all set bits in a 32-bit word. For Fermi and Kepler the indices will be captured from MSB to LSB. For Maxwell the indices are ordered from LSB to MSB.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc -m 32 -arch compute_20 -Xptxas=-v,-abi=no -cubin kth.cu"; -*- | |
#include <stdio.h> | |
#include <stdint.h> | |
// | |
// | |
// | |
typedef uint8_t u8; | |
typedef uint16_t u16; | |
typedef uint32_t u32; | |
typedef int32_t s32; | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
#define KERNEL_QUALIFIERS extern "C" __global__ | |
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static __device__ __forceinline__ | |
#define RESTRICT __restrict__ | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
warp_lane() | |
{ | |
u32 id; | |
asm("mov.u32 %0, %%laneid;" : "=r"(id)); | |
return id; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
warp_lane_mask_eq() | |
{ | |
#if __CUDA_ARCH__ >= 200 | |
u32 id; | |
asm("mov.u32 %0, %%lanemask_eq;" : "=r"(id)); | |
return id; | |
#else | |
return 1u << warp_lane(); | |
#endif | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
warp_lane_mask_lt() | |
{ | |
#if __CUDA_ARCH__ >= 200 | |
u32 id; | |
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(id)); | |
return id; | |
#else | |
return (1u << warp_lane()) - 1u; | |
#endif | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
warp_lane_mask_lte() | |
{ | |
#if __CUDA_ARCH__ >= 200 | |
u32 id; | |
asm("mov.u32 %0, %%lanemask_le;" : "=r"(id)); | |
return id; | |
#else | |
return (2u << warp_lane()) - 1u; | |
#endif | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
unsigned int | |
warp_lane_mask_gt() | |
{ | |
#if __CUDA_ARCH__ >= 200 | |
u32 id; | |
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(id)); | |
return id; | |
#else | |
return ~((2u << warp_lane()) - 1u); | |
#endif | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
warp_lane_mask_gte() | |
{ | |
#if __CUDA_ARCH__ >= 200 | |
u32 id; | |
asm("mov.u32 %0, %%lanemask_ge;" : "=r"(id)); | |
return id; | |
#else | |
return ~((1u << warp_lane()) - 1u); | |
#endif | |
} | |
// | |
// | |
// | |
#define TYPE_SCRATCH u32 | |
// | |
// | |
// | |
__shared__ volatile TYPE_SCRATCH scratch[WARP_SIZE+1]; // volatile is necessary | |
KERNEL_QUALIFIERS | |
void | |
kth_kernel(const u32 bits, TYPE_SCRATCH* const RESTRICT vout) | |
{ | |
// init scratch | |
scratch[warp_lane()] = UINT8_MAX; | |
#if (__CUDA_ARCH__ >= 500) | |
// MAXWELL: IT APPEARS "LOW LANE WINS" BUT NOT ALWAYS ;) | |
// this is a more reliable and portable implementation: | |
if ((bits & warp_lane_mask_eq()) != 0) | |
scratch[__popc(bits & warp_lane_mask_lt())] = warp_lane(); | |
#else | |
// FERMI/KEPLER: HIGH LANE WINS | |
// count all bits from lane to MSB | |
const u32 count = __popc(bits & warp_lane_mask_gte()); | |
// subtract 1 and clamp to last index in scratch | |
const u32 idx = min(count-1u,WARP_SIZE); | |
// save using "high/low lane wins" feature | |
scratch[idx] = warp_lane(); | |
#endif | |
// store | |
vout[warp_lane()] = scratch[warp_lane()]; | |
} | |
// | |
// | |
// | |
int | |
main(int argc, char** argv) | |
{ | |
const s32 device = (argc == 1) ? 0 : atoi(argv[1]); | |
const u32 bits = (argc <= 2) ? 0x55555555 : atoi(argv[2]); | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props,device); | |
printf("%s (%2d)\n",props.name,props.multiProcessorCount); | |
printf("0x%X\n",bits); | |
cudaSetDevice(device); | |
// | |
// | |
// | |
TYPE_SCRATCH* vout_d; | |
cudaMalloc(&vout_d,WARP_SIZE * sizeof(TYPE_SCRATCH)); | |
// | |
// | |
// | |
kth_kernel<<<1,WARP_SIZE>>>(bits,vout_d); | |
cudaDeviceSynchronize(); | |
// | |
// | |
// | |
TYPE_SCRATCH vout_h[WARP_SIZE]; | |
cudaMemcpy(vout_h,vout_d, | |
WARP_SIZE*sizeof(TYPE_SCRATCH), | |
cudaMemcpyDeviceToHost); | |
// | |
// | |
// | |
for (u32 ii=0; ii<WARP_SIZE; ii++) | |
printf("%3u ",(u32)vout_h[ii]); | |
printf("\n"); | |
// | |
// | |
cudaDeviceReset(); | |
return 0; | |
} |
Author
allanmac
commented
Dec 29, 2014
> kth
GeForce GTX 750 Ti ( 5)
0x55555555
0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
> kth 1
GeForce GT 545 ( 3)
0x55555555
30 28 26 24 22 20 18 16 14 12 10 8 6 4 2 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
> kth 2
GeForce GT 630 ( 2)
0x55555555
30 28 26 24 22 20 18 15 14 12 10 8 6 4 2 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
> kth 3
GeForce GTX 680 ( 8)
0x55555555
30 28 26 24 22 20 18 15 14 12 10 8 6 4 2 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
On sm_50 it works out to ~7 SASS instructions after loading the special registers:
S2R R0, SR_EQMASK;
S2R R2, SR_LANEID;
LOP.AND R0, R0, c[0x0][0x140];
ISETP.EQ.AND P0, PT, R0, RZ, PT;
MOV32I R0, 0xff;
STS.U8 [R2], R0;
S2R R0, SR_LTMASK;
LOP.AND R0, R0, c[0x0][0x140];
POPC R0, R0;
STS.U8 [R0], R2;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment