Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/0011cf0920e2b859b6ba to your computer and use it in GitHub Desktop.
Save allanmac/0011cf0920e2b859b6ba to your computer and use it in GitHub Desktop.
Blind attempt at getting f16v2 ops to work... Totally untested.
// -*- compile-command: "nvcc -m 32 -arch sm_52 -Xptxas=-v,-abi=no -cubin f16.cu" ; -*-
#include <stdint.h>
//
//
//
#define KERNEL_QUALIFIERS __global__
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS
//
//
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif
//
//
//
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS
//
//
//
#define RESTRICT __restrict__
//
//
//
#define WARP_SIZE 32
//
//
//
typedef int16_t s16;
typedef short2 s16v2;
typedef int32_t s32;
typedef uint32_t u32;
typedef int64_t s64;
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
add_f16x2(u32 a, u32 b)
{
u32 d;
asm("add.rn.ftz.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
mul_f16x2(u32 a, u32 b)
{
u32 d;
asm("mul.rn.ftz.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
fma_f16x2(u32 a, u32 b, u32 c)
{
u32 d;
asm("fma.rn.ftz.f16x2 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
return d;
}
/*
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
min_f16x2(u32 a, u32 b)
{
u32 d;
asm("min.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
max_f16x2(u32 a, u32 b)
{
u32 d;
asm("max.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
*/
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
add_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = add_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mul_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = mul_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
fma_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
const u32* const RESTRICT c,
u32* const RESTRICT d)
{
d[threadIdx.x] = fma_f16x2(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
/*
KERNEL_QUALIFIERS_EXTERN_C
void
min_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = min_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
max_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = max_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
*/
//
//
//
@allanmac
Copy link
Author

Compiling and dumping for sm_52 works but sm_50 won't dump and returns the error:

nvdisasm error : Unrecognized operation for functional unit 'uC' at address 0x00000048

That leads me to believe this is currently only an sm_52 feature.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment