Pack two unit interval 15-bit mantissa floats into a 32-bit word. Described here: https://devtalk.nvidia.com/default/topic/937736/cuda-programming-and-performance/saturated-16-bit-1-15-float-implementation/
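The trick: adding 1.0f to a unit-interval value gives a float in [1.0, 2.0] whose exponent field matches 1.0f (except at exactly 2.0), so subtracting the bit pattern of 1.0f isolates the fraction in the 23-bit mantissa; the top 15 of those bits are what each 16-bit half of the word stores. A minimal host-side sketch of the same round trip, with illustrative names that are not part of the gist:

// Host-side reference of the unit16 round trip (illustrative sketch only;
// pack_unit15()/unpack_unit15() are not names used by the gist).
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t f32_bits(float f)    { uint32_t u; memcpy(&u, &f, sizeof(u)); return u; }
static float    f32_from(uint32_t u) { float f;    memcpy(&f, &u, sizeof(f)); return f; }

// Truncating pack: (v + 1.0f) lies in [1,2], so subtracting 1.0f's bit
// pattern leaves the 23-bit mantissa; keep its top bits as a 16-bit halfword.
static uint16_t pack_unit15(float v)
{
  const uint32_t t = f32_bits(v + 1.0f) - 0x3f800000u;
  return (uint16_t)(t >> 8);
}

// Unpack: shift the halfword back into mantissa position, re-add 1.0f's
// bit pattern, and subtract 1.0f to return to the unit interval.
static float unpack_unit15(uint16_t h)
{
  return f32_from(((uint32_t)h << 8) + 0x3f800000u) - 1.0f;
}

int main(void)
{
  const float v[4] = { 0.0f, 1.0f / 3.0f, 0.15625f, 1.0f };
  for (int i = 0; i < 4; i++)
    printf("%f -> %f\n", v[i], unpack_unit15(pack_unit15(v[i])));
  return 0;
}

The CUDA source below does the same per 16-bit half, using PRMT byte permutes on sm_20+ instead of shifts and masks.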
// -*- compile-command: "nvcc -arch sm_50 -Xptxas=-v -use_fast_math unit16v2.cu -o unit16v2"; -*-

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h> // atoi()

//
//
//

#define WARP_SIZE                          32
#define KERNEL_QUALIFIERS                  extern "C" __global__
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static __device__ __forceinline__
#define PXL_RESTRICT                       __restrict__

//
//
//
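// Unpack: each 16-bit half of the word holds the top bits of the 23-bit
// mantissa of (v + 1.0f). Shifting the halfword left by 8 and adding the
// bit pattern of 1.0f (0x3f800000) produces a float in [1.0, 2.0];
// subtracting 1.0f maps it back onto the unit interval. On sm_20+ the
// byte shuffle is a single PRMT (byte permute) instruction.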
DEVICE_STATIC_INTRINSIC_QUALIFIERS
float
pxl_unit16v2_unpack_lo(const uint32_t unit16v2)
{
  uint32_t d;

#if __CUDA_ARCH__ >= 200
  asm("prmt.b32 %0, %1, 0x0, 0x7104;" : "=r"(d) : "r"(unit16v2));
#else
  d = (unit16v2 & 0x0000FFFF) << 8;
#endif

  return __uint_as_float(d + 0x3f800000) - 1.0f;
}

DEVICE_STATIC_INTRINSIC_QUALIFIERS
float
pxl_unit16v2_unpack_hi(const uint32_t unit16v2)
{
  uint32_t d;

#if __CUDA_ARCH__ >= 200
  asm("prmt.b32 %0, %1, 0x0, 0x7324;" : "=r"(d) : "r"(unit16v2));
#else
  d = (unit16v2 & 0xFFFF0000) >> 8;
#endif

  return __uint_as_float(d + 0x3f800000) - 1.0f;
}

//
//
//
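// Rounding vs. truncation: BIAS_TRUNC simply drops the low 8 mantissa
// bits when packing; BIAS_ROUND adds 2^-16 (half of the stored value's
// ULP) so the dropped bits round to nearest instead.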
#define PXL_UNIT16_PACK_MANTISSA_BIAS_ROUND (1.0f + 1.0f/65536.0f)
#define PXL_UNIT16_PACK_MANTISSA_BIAS_TRUNC 1.0f

#define PXL_UNIT16_PACK_MANTISSA_BIAS       PXL_UNIT16_PACK_MANTISSA_BIAS_TRUNC

//
//
//
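// Pack: v + bias maps the unit interval onto [1.0, 2.0], where the
// exponent field is that of 1.0f (except at exactly 2.0), so subtracting
// 0x3f800000 leaves just the 23-bit mantissa. Its top two bytes (bits
// [23:8]) are then merged into the low or high 16-bit half of the
// destination word: a single PRMT on sm_20+, shift/mask/or otherwise.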
DEVICE_STATIC_INTRINSIC_QUALIFIERS
void
pxl_unit16v2_pack_lo(uint32_t* const unit16v2, const float v)
{
  const uint32_t t = __float_as_uint(v + PXL_UNIT16_PACK_MANTISSA_BIAS) - 0x3f800000;

#if __CUDA_ARCH__ >= 200
  asm("prmt.b32 %0, %1, %2, 0x3265;" : "=r"(*unit16v2) : "r"(*unit16v2), "r"(t));
#else
  *unit16v2 = (*unit16v2 & 0xFFFF0000) | ((t >> 8) & 0x0000FFFF);
#endif
}

DEVICE_STATIC_INTRINSIC_QUALIFIERS
void
pxl_unit16v2_pack_hi(uint32_t* const unit16v2, const float v)
{
  const uint32_t t = __float_as_uint(v + PXL_UNIT16_PACK_MANTISSA_BIAS) - 0x3f800000;

#if __CUDA_ARCH__ >= 200
  asm("prmt.b32 %0, %1, %2, 0x6510;" : "=r"(*unit16v2) : "r"(*unit16v2), "r"(t));
#else
  *unit16v2 = (*unit16v2 & 0x0000FFFF) | ((t << 8) & 0xFFFF0000);
#endif
}

//
//
//
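// Pack kernel: each thread reads one float4 and packs its four lanes
// into a uint2 (two unit16v2 words).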
KERNEL_QUALIFIERS
void
pxl_unit16v2_pack_kernel(uint2* const packed, const float4* const unpacked)
{
  const int32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

  float4 xyzw = unpacked[gidx];

  uint2 unit16v2x2;

  pxl_unit16v2_pack_lo(&unit16v2x2.x, xyzw.x);
  pxl_unit16v2_pack_hi(&unit16v2x2.x, xyzw.y);
  pxl_unit16v2_pack_lo(&unit16v2x2.y, xyzw.z);
  pxl_unit16v2_pack_hi(&unit16v2x2.y, xyzw.w);

  packed[gidx] = unit16v2x2;
}

//
//
//
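// Unpack kernel: the inverse of the above; each thread expands one
// uint2 back into a float4.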
KERNEL_QUALIFIERS
void
pxl_unit16v2_unpack_kernel(float4* const unpacked, const uint2* const packed)
{
  const int32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

  uint2 unit16v2x2 = packed[gidx];

  float4 xyzw;

  xyzw.x = pxl_unit16v2_unpack_lo(unit16v2x2.x);
  xyzw.y = pxl_unit16v2_unpack_hi(unit16v2x2.x);
  xyzw.z = pxl_unit16v2_unpack_lo(unit16v2x2.y);
  xyzw.w = pxl_unit16v2_unpack_hi(unit16v2x2.y);

  unpacked[gidx] = xyzw;
}

//
//
//
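// Host test: fill a buffer of float4 values, round-trip it through the
// pack and unpack kernels, and print the results.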
int
main(int argc, char** argv)
{
  const int32_t device = (argc == 1) ? 0 : atoi(argv[1]);

  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, device);

  printf("%s (%2d)\n", props.name, props.multiProcessorCount);

  cudaSetDevice(device);

  //
  //
  //

#define PXL_COUNT WARP_SIZE

  uint2* packed_d;
  cudaMalloc(&packed_d, PXL_COUNT * sizeof(uint2));

  float4* unpacked_d;
  cudaMalloc(&unpacked_d, PXL_COUNT * sizeof(float4));

  //
  //
  //

  float4* unpacked_h = (float4*)malloc(PXL_COUNT * sizeof(float4));

  for (int32_t ii=0; ii<PXL_COUNT; ii++)
    unpacked_h[ii] = { 0.0f, 1.0f/3.0f, 0.15625f /*0.75f*/ /*0.9999847*/, 1.0f };

  cudaMemcpy(unpacked_d,
             unpacked_h,
             PXL_COUNT * sizeof(float4),
             cudaMemcpyHostToDevice);

  //
  //
  //

  pxl_unit16v2_pack_kernel  <<<(PXL_COUNT+WARP_SIZE-1)/WARP_SIZE,WARP_SIZE>>>(packed_d,  unpacked_d);
  pxl_unit16v2_unpack_kernel<<<(PXL_COUNT+WARP_SIZE-1)/WARP_SIZE,WARP_SIZE>>>(unpacked_d,packed_d);

  cudaDeviceSynchronize();

  //
  //
  //

  cudaMemcpy(unpacked_h,
             unpacked_d,
             PXL_COUNT * sizeof(float4),
             cudaMemcpyDeviceToHost);

  //
  //
  //

  for (int32_t ii=0; ii<PXL_COUNT; ii++)
    {
      printf("%1.6f %1.6f %1.6f %1.6f\n",
             unpacked_h[ii].x,
             unpacked_h[ii].y,
             unpacked_h[ii].z,
             unpacked_h[ii].w);
    }

  //
  //
  //

  cudaDeviceReset();

  return 0;
}