Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/0011cf0920e2b859b6ba to your computer and use it in GitHub Desktop.
Save allanmac/0011cf0920e2b859b6ba to your computer and use it in GitHub Desktop.
Blind attempt at getting f16v2 ops to work... Totally untested.
// -*- compile-command: "nvcc -m 32 -arch sm_52 -Xptxas=-v,-abi=no -cubin f16.cu" ; -*-
#include <stdint.h>
//
//
//
#define KERNEL_QUALIFIERS __global__
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS
//
//
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif
//
//
//
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS
//
//
//
#define RESTRICT __restrict__
//
//
//
#define WARP_SIZE 32
//
//
//
typedef int16_t s16;
typedef short2 s16v2;
typedef int32_t s32;
typedef uint32_t u32;
typedef int64_t s64;
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
add_f16x2(u32 a, u32 b)
{
u32 d;
asm("add.rn.ftz.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
mul_f16x2(u32 a, u32 b)
{
u32 d;
asm("mul.rn.ftz.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
fma_f16x2(u32 a, u32 b, u32 c)
{
u32 d;
asm("fma.rn.ftz.f16x2 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
return d;
}
/*
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
min_f16x2(u32 a, u32 b)
{
u32 d;
asm("min.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
max_f16x2(u32 a, u32 b)
{
u32 d;
asm("max.f16x2 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
*/
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
add_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = add_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mul_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = mul_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
fma_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
const u32* const RESTRICT c,
u32* const RESTRICT d)
{
d[threadIdx.x] = fma_f16x2(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
/*
KERNEL_QUALIFIERS_EXTERN_C
void
min_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = min_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
max_f16x2_kernel(const u32* const RESTRICT a,
const u32* const RESTRICT b,
u32* const RESTRICT d)
{
d[threadIdx.x] = max_f16x2(a[threadIdx.x],
b[threadIdx.x]);
}
*/
//
//
//
@allanmac
Copy link
Author

cuobjdump.exe -sass f16.cubin

    code for sm_52
        Function : add_f16x2
    .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"

        S2R R3, SR_TID.X;
        ISCADD R0, R3.reuse, c[0x0][0x140], 0x2;
        ISCADD R1, R3.reuse, c[0x0][0x144], 0x2;
        LDG.CI R0, [R0];        }

        ISCADD R3, R3, c[0x0][0x148], 0x2;
        LDG.CI R1, [R1];        }

        HADD2.FTZ R2, R0, R1;  <------------------- YAY!
        STG [R3], R2;
        EXIT;

        BRA 0x60;
        NOP;
        NOP;
        ..........................


        Function : mul_f16x2
    .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"

        S2R R3, SR_TID.X;
        ISCADD R0, R3.reuse, c[0x0][0x140], 0x2;
        ISCADD R1, R3.reuse, c[0x0][0x144], 0x2;
        LDG.CI R0, [R0];        }

        ISCADD R3, R3, c[0x0][0x148], 0x2;
        LDG.CI R1, [R1];        }

        HMUL2.FTZ R2, R0, R1;  <------------------- YAY!
        STG [R3], R2;
        EXIT;

        BRA 0x60;
        NOP;
        NOP;
        ..........................

        Function : fma_f16x2
    .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"

        S2R R4, SR_TID.X;
        ISCADD R0, R4.reuse, c[0x0][0x140], 0x2;
        ISCADD R1, R4.reuse, c[0x0][0x144], 0x2;
        LDG.CI R0, [R0];        }

        ISCADD R2, R4.reuse, c[0x0][0x148], 0x2;
        LDG.CI R1, [R1];        }

        ISCADD R4, R4, c[0x0][0x14c], 0x2;
        LDG.CI R2, [R2];        }
        HFMA2.FTZ R3, R0, R1, R2;  <------------------- YAY!

        STG [R4], R3;
        EXIT;
        BRA 0x78;
        ..........................

@allanmac
Copy link
Author

Compiling and dumping for sm_52 works but sm_50 won't dump and returns the error:

nvdisasm error : Unrecognized operation for functional unit 'uC' at address 0x00000048

That leads me to believe this is currently only an sm_52 feature.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment