Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active August 29, 2015 14:12
Embed
What would you like to do?
Acquire the indices of all set bits in a 32-bit word. For Fermi and Kepler the indices will be captured from MSB to LSB. For Maxwell the indices are ordered from LSB to MSB.
// -*- compile-command: "nvcc -m 32 -arch compute_20 -Xptxas=-v,-abi=no -cubin kth.cu"; -*-
#include <stdio.h>
#include <stdint.h>
//
//
//
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef int32_t s32;
//
//
//
#define WARP_SIZE 32
#define KERNEL_QUALIFIERS extern "C" __global__
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static __device__ __forceinline__
#define RESTRICT __restrict__
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane()
{
u32 id;
asm("mov.u32 %0, %%laneid;" : "=r"(id));
return id;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_eq()
{
#if __CUDA_ARCH__ >= 200
u32 id;
asm("mov.u32 %0, %%lanemask_eq;" : "=r"(id));
return id;
#else
return 1u << warp_lane();
#endif
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_lt()
{
#if __CUDA_ARCH__ >= 200
u32 id;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(id));
return id;
#else
return (1u << warp_lane()) - 1u;
#endif
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_lte()
{
#if __CUDA_ARCH__ >= 200
u32 id;
asm("mov.u32 %0, %%lanemask_le;" : "=r"(id));
return id;
#else
return (2u << warp_lane()) - 1u;
#endif
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
unsigned int
warp_lane_mask_gt()
{
#if __CUDA_ARCH__ >= 200
u32 id;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(id));
return id;
#else
return ~((2u << warp_lane()) - 1u);
#endif
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_gte()
{
#if __CUDA_ARCH__ >= 200
u32 id;
asm("mov.u32 %0, %%lanemask_ge;" : "=r"(id));
return id;
#else
return ~((1u << warp_lane()) - 1u);
#endif
}
//
//
//
#define TYPE_SCRATCH u32
//
//
//
__shared__ volatile TYPE_SCRATCH scratch[WARP_SIZE+1]; // volatile is necessary
KERNEL_QUALIFIERS
void
kth_kernel(const u32 bits, TYPE_SCRATCH* const RESTRICT vout)
{
// init scratch
scratch[warp_lane()] = UINT8_MAX;
#if (__CUDA_ARCH__ >= 500)
// MAXWELL: IT APPEARS "LOW LANE WINS" BUT NOT ALWAYS ;)
// this is a more reliable and portable implementation:
if ((bits & warp_lane_mask_eq()) != 0)
scratch[__popc(bits & warp_lane_mask_lt())] = warp_lane();
#else
// FERMI/KEPLER: HIGH LANE WINS
// count all bits from lane to MSB
const u32 count = __popc(bits & warp_lane_mask_gte());
// subtract 1 and clamp to last index in scratch
const u32 idx = min(count-1u,WARP_SIZE);
// save using "high/low lane wins" feature
scratch[idx] = warp_lane();
#endif
// store
vout[warp_lane()] = scratch[warp_lane()];
}
//
//
//
int
main(int argc, char** argv)
{
const s32 device = (argc == 1) ? 0 : atoi(argv[1]);
const u32 bits = (argc <= 2) ? 0x55555555 : atoi(argv[2]);
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
printf("0x%X\n",bits);
cudaSetDevice(device);
//
//
//
TYPE_SCRATCH* vout_d;
cudaMalloc(&vout_d,WARP_SIZE * sizeof(TYPE_SCRATCH));
//
//
//
kth_kernel<<<1,WARP_SIZE>>>(bits,vout_d);
cudaDeviceSynchronize();
//
//
//
TYPE_SCRATCH vout_h[WARP_SIZE];
cudaMemcpy(vout_h,vout_d,
WARP_SIZE*sizeof(TYPE_SCRATCH),
cudaMemcpyDeviceToHost);
//
//
//
for (u32 ii=0; ii<WARP_SIZE; ii++)
printf("%3u ",(u32)vout_h[ii]);
printf("\n");
//
//
cudaDeviceReset();
return 0;
}
@allanmac
Copy link
Author

nvcc -m 32 -Xptxas=-v,-abi=no             \
    -gencode=arch=compute_20,code=sm_21     \
    -gencode=arch=compute_30,code=sm_30     \
    -gencode=arch=compute_35,code=sm_35     \
    -gencode=arch=compute_50,code=sm_50     \
    kth.cu -o kth

@allanmac
Copy link
Author

> kth
GeForce GTX 750 Ti ( 5)
0x55555555
  0   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 

> kth 1
GeForce GT 545 ( 3)
0x55555555
 30  28  26  24  22  20  18  16  14  12  10   8   6   4   2   0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 

> kth 2
GeForce GT 630 ( 2)
0x55555555
 30  28  26  24  22  20  18  15  14  12  10   8   6   4   2   0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 

> kth 3
GeForce GTX 680 ( 8)
0x55555555
 30  28  26  24  22  20  18  15  14  12  10   8   6   4   2   0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 

@allanmac
Copy link
Author

On sm_50 it works out to ~7 SASS instructions after loading the special registers:

        S2R R0, SR_EQMASK; 
        S2R R2, SR_LANEID;  
        LOP.AND R0, R0, c[0x0][0x140];   
        ISETP.EQ.AND P0, PT, R0, RZ, PT; 
        MOV32I R0, 0xff;                 
        STS.U8 [R2], R0;                 
        S2R R0, SR_LTMASK;               
        LOP.AND R0, R0, c[0x0][0x140];   
        POPC R0, R0;                     
        STS.U8 [R0], R2;                 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment