Last active
August 29, 2015 14:13
-
-
Save allanmac/0011cf0920e2b859b6ba to your computer and use it in GitHub Desktop.
Blind attempt at getting f16v2 ops to work... Totally untested.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc -m 32 -arch sm_52 -Xptxas=-v,-abi=no -cubin f16.cu" ; -*- | |
#include <stdint.h> | |
// | |
// | |
// | |
#define KERNEL_QUALIFIERS __global__ | |
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS | |
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS | |
// | |
// | |
// | |
#ifndef _DEBUG | |
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__ | |
#else | |
#define DEVICE_FUNCTION_QUALIFIERS __device__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ | |
#endif | |
// | |
// | |
// | |
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS | |
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS | |
// | |
// | |
// | |
#define RESTRICT __restrict__ | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
// | |
// | |
// | |
typedef int16_t s16; | |
typedef short2 s16v2; | |
typedef int32_t s32; | |
typedef uint32_t u32; | |
typedef int64_t s64; | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
add_f16x2(u32 a, u32 b) | |
{ | |
u32 d; | |
asm("add.rn.ftz.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
mul_f16x2(u32 a, u32 b) | |
{ | |
u32 d; | |
asm("mul.rn.ftz.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
fma_f16x2(u32 a, u32 b, u32 c) | |
{ | |
u32 d; | |
asm("fma.rn.ftz.f16x2 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c)); | |
return d; | |
} | |
/* | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
min_f16x2(u32 a, u32 b) | |
{ | |
u32 d; | |
asm("min.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
u32 | |
max_f16x2(u32 a, u32 b) | |
{ | |
u32 d; | |
asm("max.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b)); | |
return d; | |
} | |
*/ | |
// | |
// | |
// | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
add_f16x2_kernel(const u32* const RESTRICT a, | |
const u32* const RESTRICT b, | |
u32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = add_f16x2(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mul_f16x2_kernel(const u32* const RESTRICT a, | |
const u32* const RESTRICT b, | |
u32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mul_f16x2(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
fma_f16x2_kernel(const u32* const RESTRICT a, | |
const u32* const RESTRICT b, | |
const u32* const RESTRICT c, | |
u32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = fma_f16x2(a[threadIdx.x], | |
b[threadIdx.x], | |
c[threadIdx.x]); | |
} | |
/* | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
min_f16x2_kernel(const u32* const RESTRICT a, | |
const u32* const RESTRICT b, | |
u32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = min_f16x2(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
max_f16x2_kernel(const u32* const RESTRICT a, | |
const u32* const RESTRICT b, | |
u32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = max_f16x2(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
*/ | |
// | |
// | |
// |
Compiling and dumping for sm_52 works but sm_50 won't dump and returns the error:
nvdisasm error : Unrecognized operation for functional unit 'uC' at address 0x00000048
That leads me to believe this is currently only an sm_52 feature.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
cuobjdump.exe -sass f16.cubin