Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active November 1, 2018 04:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save allanmac/8973f01a2e5b2aa5a994 to your computer and use it in GitHub Desktop.
Save allanmac/8973f01a2e5b2aa5a994 to your computer and use it in GitHub Desktop.
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin int_mul.cu" ; -*-
#include <stdint.h>
#include <cuda_fp16.h>
//
//
//
#define KERNEL_QUALIFIERS __global__
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS
//
//
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif
//
//
//
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS
//
//
//
#define RESTRICT __restrict__
//
//
//
#define WARP_SIZE 32
//
//
//
typedef int16_t s16;
typedef short2 s16v2;
typedef uint16_t u16;
typedef ushort2 u16v2;
typedef int32_t s32;
typedef uint32_t u32;
typedef int64_t s64;
//
//
//
typedef u16 q16;
union q16v2
{
u32 lohi;
struct {
q16 lo;
q16 hi;
};
};
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
q16v2
mad_q16v2(union q16v2 a, union q16v2 b, union q16v2 c)
{
u32 d,e;
asm("vmad.u32.u32.u32.shr15 %0, %1.h0, %2.h0, %3;" : "=r"(d) : "r"(a.lohi), "r"(b.lohi), "r"((u32)c.lo));
asm("vmad.u32.u32.u32.shr15 %0, %1.h1, %2.h1, %3;" : "=r"(e) : "r"(a.lohi), "r"(b.lohi), "r"((u32)c.hi));
q16v2 r;
r.lo = d;
r.hi = e;
return r;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
half2
fma_half2(half2 a, half2 b, half2 c)
{
#if __CUDA_ARCH__ >= 530
return __hfma2(a,b,c);
#else
return __floats2half2_rn(fmaf( __low2float(a), __low2float(b), __low2float(c)),
fmaf(__high2float(a),__high2float(b),__high2float(c)));
#endif
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u16v2
mad_u16v2(u16v2 a, u16v2 b, u32 c)
{
u16v2 r;
{
u32 d;
asm("mad.wide.u16 %0, %1, %2, %3;" : "=r"(d) : "h"(a.x), "h"(b.x), "r"(c));
r.x = d >> 15;
}
{
u32 d;
asm("mad.wide.u16 %0, %1, %2, %3;" : "=r"(d) : "h"(a.y), "h"(b.y), "r"(c));
r.y = d >> 15;
}
return r;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_wide_s16(s16 a, s16 b)
{
s32 d;
asm("mul.wide.s16 %0, %1, %2;" : "=r"(d) : "h"(a), "h"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_wide_s16(s16 a, s16 b, s32 c)
{
s32 d;
asm("mad.wide.s16 %0, %1, %2, %3;" : "=r"(d) : "h"(a), "h"(b), "r"(c));
return d;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_s32_s16(s32 a, s16 b)
{
return a * b;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_s32_s16(s32 a, s16 b, s32 c)
{
return a * b + c;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_lo_s32(s32 a, s32 b)
{
s32 d;
asm("mul.lo.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_lo_s32(s32 a, s32 b, s32 c)
{
s32 d;
asm("mad.lo.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
return d;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mul_wide_s32(s32 a, s32 b)
{
s64 d;
asm("mul.wide.s32 %0, %1, %2;" : "=l"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mad_wide_s32(s32 a, s32 b, s64 c)
{
s64 d;
asm("mad.wide.s32 %0, %1, %2, %3;" : "=l"(d) : "r"(a), "r"(b), "l"(c));
return d;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mul_s64(s64 a, s64 b)
{
s64 d;
asm("mul.lo.s64 %0, %1, %2;" : "=l"(d) : "l"(a), "l"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mad_s64(s64 a, s64 b, s64 c)
{
s64 d;
asm("mad.lo.s64 %0, %1, %2, %3;" : "=l"(d) : "l"(a), "l"(b), "l"(c));
return d;
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mad_q16v2_kernel(const union q16v2* const RESTRICT a,
const union q16v2* const RESTRICT b,
const union q16v2* const RESTRICT c,
union q16v2* const RESTRICT d)
{
d[threadIdx.x] = mad_q16v2(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
fma_half2_kernel(const half2* const RESTRICT a,
const half2* const RESTRICT b,
const half2* const RESTRICT c,
half2* const RESTRICT d)
{
d[threadIdx.x] = fma_half2(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mad_u16v2_kernel(const u16v2* const RESTRICT a,
const u16v2* const RESTRICT b,
const u32* const RESTRICT c,
u16v2* const RESTRICT d)
{
d[threadIdx.x] = mad_u16v2(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_wide_s16_kernel(const s16* const RESTRICT a,
const s16* const RESTRICT b,
s32* const RESTRICT d)
{
d[threadIdx.x] = mul_wide_s16(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_wide_s16_kernel(const s16* const RESTRICT a,
const s16* const RESTRICT b,
const s32* const RESTRICT c,
s32* const RESTRICT d)
{
d[threadIdx.x] = mad_wide_s16(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_s32_s16_kernel(const s32* const RESTRICT a,
const s16* const RESTRICT b,
s32* const RESTRICT d)
{
d[threadIdx.x] = mul_s32_s16(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_s32_s16_kernel(const s32* const RESTRICT a,
const s16* const RESTRICT b,
const s32* const RESTRICT c,
s32* const RESTRICT d)
{
d[threadIdx.x] = mad_s32_s16(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_lo_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
s32* const RESTRICT d)
{
d[threadIdx.x] = mul_lo_s32(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_lo_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
const s32* const RESTRICT c,
s32* const RESTRICT d)
{
d[threadIdx.x] = mad_lo_s32(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_wide_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
s64* const RESTRICT d)
{
d[threadIdx.x] = mul_wide_s32(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_wide_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
const s64* const RESTRICT c,
s64* const RESTRICT d)
{
d[threadIdx.x] = mad_wide_s32(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_s64_kernel(const s64* const RESTRICT a,
const s64* const RESTRICT b,
s64* const RESTRICT d)
{
d[threadIdx.x] = mul_s64(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_s64_kernel(const s64* const RESTRICT a,
const s64* const RESTRICT b,
const s64* const RESTRICT c,
s64* const RESTRICT d)
{
d[threadIdx.x] = mad_s64(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
@allanmac
Copy link
Author

Kepler (sm_35):

code for sm_35
        Function : mul_wide_s32_kernel
        /*0048*/                   IMUL.HI R5, R1, R0;
        /*0050*/                   IMUL R4, R1, R0;
        ....................................


        Function : mul_lo_s32_kernel
        /*0048*/                   IMUL R0, R1, R0;
        ..................................


        Function : mul_s64_kernel
        /*0048*/                   IMUL.U32.U32 R6.CC, R8, R4;
        /*0050*/                   IMAD.U32.U32.HI.X R2.CC, R8, R4, RZ;
        /*0058*/                   IMAD.U32.U32.X R1, R9, R4, R2;
        /*0060*/                   IMAD.U32.U32 R7, R8, R5, R1;
        ...............................


        Function : mad_wide_s32_kernel
        /*0058*/                   IMAD R4.CC, R1, R0, R6;
        /*0060*/                   IMAD.HI.X R5, R1, R0, R7;
        ....................................


        Function : mad_lo_s32_kernel
        /*0058*/                   IMAD R0, R4, R1, R0;
        ..................................


        Function : mad_s64_kernel
        /*0058*/                   IMUL.U32.U32 R6.CC, R8, R4;
        /*0060*/                   IMAD.U32.U32.HI.X R7.CC, R8, R4, RZ;
        /*0068*/                   IMAD.U32.U32.X R1, R9, R4, R7;
        /*0078*/                   IADD R2.CC, R6, R2;
        /*0088*/                   IMAD.U32.U32 R4, R8, R5, R1;
        /*0090*/                   IADD.X R3, R4, R3;
        ...............................

@allanmac
Copy link
Author

Maxwell v2 (sm_52)

code for sm_52
        Function : mul_wide_s32_kernel
        /*0038*/                   XMAD R3, R1.reuse, R0.reuse, RZ;
        /*0048*/                   XMAD.MRG R4, R1.reuse, R0.H1.reuse, RZ;
        /*0050*/                   XMAD.U16.S16 R5, R1.reuse, R0.H1.reuse, RZ;
        /*0058*/                   XMAD.S16.S16.CSFU R6, R1.H1.reuse, R0.H1.reuse, RZ;
        /*0068*/                   XMAD.S16.U16.CHI R7, R1.H1.reuse, R0, R3.reuse;
        /*0070*/                   XMAD.PSL.CBCC R0, R1.H1, R4.H1, R3;
        /*0088*/                   IADD3.RS R1, R7, R5, R6;
        ....................................


        Function : mul_lo_s32_kernel
        /*0048*/                   XMAD R2, R0.reuse, R1.reuse, RZ;
        /*0050*/                   XMAD.MRG R4, R0.reuse, R1.H1, RZ;
        /*0058*/                   XMAD.PSL.CBCC R2, R0.H1, R4.H1, R2;
        ..................................


        Function : mul_s64_kernel
        /*0038*/                   XMAD R2, R10.reuse, R0.reuse, RZ;
        /*0048*/                   XMAD R6, R11.reuse, R0.reuse, RZ;
        /*0050*/                   XMAD.MRG R7, R11.reuse, R0.H1.reuse, RZ;
        /*0058*/                   XMAD R3, R10.reuse, R0.H1.reuse, RZ;
        /*0068*/                   XMAD R4, R10.H1.reuse, R0.H1.reuse, RZ;
        /*0070*/                   XMAD R8, R10.reuse, R1.reuse, RZ;
        /*0078*/                   XMAD.MRG R9, R10.reuse, R1.H1, RZ;
        /*0088*/                   XMAD.CHI R5, R10.H1.reuse, R0.reuse, R2.reuse;
        /*0090*/                   XMAD.MRG R1, R10.reuse, R0.H1, RZ;
        /*0098*/                   XMAD.PSL.CBCC R6, R11.H1, R7.H1, R6;
        /*00a8*/                   XMAD.PSL.CBCC R7, R10.H1.reuse, R9.H1, R8;
        /*00b0*/                   IADD3.RS R3, R5, R3, R4;
        /*00b8*/                   XMAD.PSL.CBCC R0, R10.H1, R1.H1, R2;
        /*00d0*/                   IADD3 R1, R7, R3, R6;
        ...............................


        Function : mad_wide_s32_kernel
        /*0050*/                   XMAD R4, R7.reuse, R2.reuse, RZ;
        /*0058*/                   XMAD.MRG R3, R7.reuse, R2.H1.reuse, RZ;
        /*0068*/                   XMAD.PSL.CBCC R3, R7.H1.reuse, R3.H1, R4.reuse;
        /*0070*/                   XMAD.U16.S16 R5, R7.reuse, R2.H1.reuse, RZ;
        /*0078*/                   XMAD.S16.S16.CSFU R6, R7.H1.reuse, R2.H1.reuse, RZ;
        /*0088*/                   IADD R0.CC, R0, R3;
        /*0090*/                   XMAD.S16.U16.CHI R3, R7.H1, R2, R4;
        /*0098*/                   IADD3.RS R3, R3, R5, R6;
        /*00b0*/                   IADD.X R1, R1, R3;
        ....................................


        Function : mad_lo_s32_kernel
        /*0058*/                   XMAD.MRG R5, R0.reuse, R1.H1.reuse, RZ;
        /*0068*/                   XMAD R3, R0.reuse, R1, R2;
        /*0070*/                   XMAD.PSL.CBCC R3, R0.H1, R5.H1, R3;
        ..................................


        Function : mad_s64_kernel
        /*0050*/                   XMAD R4, R12.reuse, R0.reuse, RZ;
        /*0058*/                   XMAD.MRG R5, R12.reuse, R0.H1.reuse, RZ;
        /*0068*/                   XMAD R6, R12.reuse, R0.reuse, RZ;
        /*0070*/                   XMAD R7, R13.reuse, R0.reuse, RZ;
        /*0078*/                   XMAD.MRG R8, R13.reuse, R0.H1.reuse, RZ;
        /*0088*/                   XMAD R9, R12.reuse, R1.reuse, RZ;
        /*0090*/                   XMAD.MRG R10, R12.reuse, R1.H1, RZ;
        /*0098*/                   XMAD.PSL.CBCC R4, R12.H1.reuse, R5.H1, R4;
        /*00a8*/                   XMAD R1, R12.reuse, R0.H1.reuse, RZ;
        /*00b0*/                   XMAD R5, R12.H1.reuse, R0.H1.reuse, RZ;
        /*00b8*/                   XMAD.CHI R14, R12.H1.reuse, R0, R6;
        /*00c8*/                   XMAD.PSL.CBCC R6, R13.H1, R8.H1, R7;
        /*00d0*/                   XMAD.PSL.CBCC R7, R12.H1, R10.H1, R9;
        /*00d8*/                   IADD R0.CC, R2, R4;
        /*00e8*/                   IADD3.RS R1, R14, R1, R5;
        /*00f0*/                   IADD3 R1, R7, R1, R6;
        /*0108*/                   IADD.X R1, R3, R1;
        ...............................

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment