Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active May 29, 2016 16:36
Show Gist options
  • Save allanmac/d098943af1bb26dcfd42aa9f021b55d9 to your computer and use it in GitHub Desktop.
Save allanmac/d098943af1bb26dcfd42aa9f021b55d9 to your computer and use it in GitHub Desktop.
Pack two unit interval 15-bit mantissa floats into a 32-bit word. Described here: https://devtalk.nvidia.com/default/topic/937736/cuda-programming-and-performance/saturated-16-bit-1-15-float-implementation/
// -*- compile-command: "nvcc -arch sm_50 -Xptxas=-v -use_fast_math unit16v2.cu -o unit16v2"; -*-
#include <stdio.h>
#include <stdint.h>
//
//
//
#define WARP_SIZE 32
#define KERNEL_QUALIFIERS extern "C" __global__
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static __device__ __forceinline__
#define PXL_RESTRICT __restrict__
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
float
pxl_unit16v2_unpack_lo(const uint32_t unit16v2)
{
uint32_t d;
#if __CUDA_ARCH__ >= 200
asm("prmt.b32 %0, %1, 0x0, 0x7104;" : "=r"(d) : "r"(unit16v2));
#else
d = (unit16v2 & 0x000FFFF) << 8;
#endif
return __uint_as_float(d + 0x3f800000) - 1.0f;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
float
pxl_unit16v2_unpack_hi(const uint32_t unit16v2)
{
uint32_t d;
#if __CUDA_ARCH__ >= 200
asm("prmt.b32 %0, %1, 0x0, 0x7324;" : "=r"(d) : "r"(unit16v2));
#else
d = (unit16v2 & 0xFFFF0000) >> 8;
#endif
return __uint_as_float(d + 0x3f800000) - 1.0f;
}
//
//
//
#define PXL_UNIT16_PACK_MANTISSA_BIAS_ROUND (1.0f + 1.0f/65536.0f)
#define PXL_UNIT16_PACK_MANTISSA_BIAS_TRUNC 1.0f
#define PXL_UNIT16_PACK_MANTISSA_BIAS PXL_UNIT16_PACK_MANTISSA_BIAS_TRUNC
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
void
pxl_unit16v2_pack_lo(uint32_t* const unit16v2, const float v)
{
const uint32_t t = __float_as_uint(v + PXL_UNIT16_PACK_MANTISSA_BIAS) - 0x3f800000;
#if __CUDA_ARCH__ >= 200
asm("prmt.b32 %0, %1, %2, 0x3265;" : "=r"(*unit16v2) : "r"(*unit16v2), "r"(t));
#else
*unit16v2 = (*unit16v2 & 0xFFFF0000) | ((t >> 8) & 0x0000FFFF);
#endif
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
void
pxl_unit16v2_pack_hi(uint32_t* const unit16v2, const float v)
{
const uint32_t t = __float_as_uint(v + PXL_UNIT16_PACK_MANTISSA_BIAS) - 0x3f800000;
#if __CUDA_ARCH__ >= 200
asm("prmt.b32 %0, %1, %2, 0x6510;" : "=r"(*unit16v2) : "r"(*unit16v2), "r"(t));
#else
*unit16v2 = (*unit16v2 & 0x0000FFFF) | ((t << 8) & 0xFFFF0000);
#endif
}
//
//
//
KERNEL_QUALIFIERS
void
pxl_unit16v2_pack_kernel(uint2* const packed, const float4* const unpacked)
{
const int32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
float4 xyzw = unpacked[gidx];
uint2 unit16v2x2;
pxl_unit16v2_pack_lo(&unit16v2x2.x,xyzw.x);
pxl_unit16v2_pack_hi(&unit16v2x2.x,xyzw.y);
pxl_unit16v2_pack_lo(&unit16v2x2.y,xyzw.z);
pxl_unit16v2_pack_hi(&unit16v2x2.y,xyzw.w);
packed[gidx] = unit16v2x2;
}
//
//
//
KERNEL_QUALIFIERS
void
pxl_unit16v2_unpack_kernel(float4* const unpacked, const uint2* const packed)
{
const int32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
uint2 unit16v2x2 = packed[gidx];
float4 xyzw;
xyzw.x = pxl_unit16v2_unpack_lo(unit16v2x2.x);
xyzw.y = pxl_unit16v2_unpack_hi(unit16v2x2.x);
xyzw.z = pxl_unit16v2_unpack_lo(unit16v2x2.y);
xyzw.w = pxl_unit16v2_unpack_hi(unit16v2x2.y);
unpacked[gidx] = xyzw;
}
//
//
//
int
main(int argc, char** argv)
{
const int32_t device = (argc == 1) ? 0 : atoi(argv[1]);
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
cudaSetDevice(device);
//
//
//
#define PXL_COUNT WARP_SIZE
uint2* packed_d;
cudaMalloc(&packed_d,PXL_COUNT * sizeof(uint2));
float4* unpacked_d;
cudaMalloc(&unpacked_d,PXL_COUNT* sizeof(float4));
//
//
//
float4* unpacked_h = (float4*)malloc(PXL_COUNT * sizeof(float4));
for (int32_t ii=0; ii<PXL_COUNT; ii++)
unpacked_h[ii] = { 0.0f, 1.0f/3.0f, 0.15625 /*0.75f*/ /*0.9999847*/, 1.0f };
cudaMemcpy(unpacked_d,
unpacked_h,
PXL_COUNT * sizeof(float4),
cudaMemcpyHostToDevice);
//
//
//
pxl_unit16v2_pack_kernel <<<(PXL_COUNT+WARP_SIZE-1)/WARP_SIZE,WARP_SIZE>>>(packed_d,unpacked_d);
pxl_unit16v2_unpack_kernel<<<(PXL_COUNT+WARP_SIZE-1)/WARP_SIZE,WARP_SIZE>>>(unpacked_d,packed_d);
cudaDeviceSynchronize();
//
//
//
cudaMemcpy(unpacked_h,
unpacked_d,
PXL_COUNT * sizeof(float4),
cudaMemcpyDeviceToHost);
//
//
//
for (int32_t ii=0; ii<WARP_SIZE; ii++)
{
printf("%1.6f %1.6f %1.6f %1.6f\n",
unpacked_h[ii].x,
unpacked_h[ii].y,
unpacked_h[ii].z,
unpacked_h[ii].w);
}
//
//
//
cudaDeviceReset();
return 0;
}
@allanmac
Copy link
Author

float_example

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment