Skip to content

Instantly share code, notes, and snippets.

@oscarbg
Forked from allanmac/int_mul.cu
Last active August 29, 2015 14:13
Show Gist options
  • Save oscarbg/0d3fa07385bef69a8455 to your computer and use it in GitHub Desktop.
Save oscarbg/0d3fa07385bef69a8455 to your computer and use it in GitHub Desktop.
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin int_mul.cu" ; -*-
#include <stdint.h>
//
//
//
#define KERNEL_QUALIFIERS __global__
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS
//
//
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif
//
//
//
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS
//
//
//
#define RESTRICT __restrict__
//
//
//
#define WARP_SIZE 32
//
//
//
typedef int16_t s16;
typedef short2 s16v2;
typedef int32_t s32;
typedef uint32_t u32;
typedef int64_t s64;
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_wide_s16(s16 a, s16 b)
{
s32 d;
asm("mul.wide.s16 %0, %1, %2;" : "=r"(d) : "h"(a), "h"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_wide_s16(s16 a, s16 b, s32 c)
{
s32 d;
asm("mad.wide.s16 %0, %1, %2, %3;" : "=r"(d) : "h"(a), "h"(b), "r"(c));
return d;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_s32_s16(s32 a, s16 b)
{
return a * b;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_s32_s16(s32 a, s16 b, s32 c)
{
return a * b + c;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_lo_s32(s32 a, s32 b)
{
s32 d;
asm("mul.lo.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_lo_s32(s32 a, s32 b, s32 c)
{
s32 d;
asm("mad.lo.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
return d;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mul_wide_s32(s32 a, s32 b)
{
s64 d;
asm("mul.wide.s32 %0, %1, %2;" : "=l"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mad_wide_s32(s32 a, s32 b, s64 c)
{
s64 d;
asm("mad.wide.s32 %0, %1, %2, %3;" : "=l"(d) : "r"(a), "r"(b), "l"(c));
return d;
}
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mul_s64(s64 a, s64 b)
{
s64 d;
asm("mul.lo.s64 %0, %1, %2;" : "=l"(d) : "l"(a), "l"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mad_s64(s64 a, s64 b, s64 c)
{
s64 d;
asm("mad.lo.s64 %0, %1, %2, %3;" : "=l"(d) : "l"(a), "l"(b), "l"(c));
return d;
}
//
//
//
/*
KERNEL_QUALIFIERS_EXTERN_C
void
mul_s16v2_kernel(const s16v2* const RESTRICT a,
const s16v2* const RESTRICT b,
s16v2* const RESTRICT d)
{
d[threadIdx.x] = mul_s16v2(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_s16v2_kernel(const s16v2* const RESTRICT a,
const s16v2* const RESTRICT b,
const s16v2* const RESTRICT c,
s16v2* const RESTRICT d)
{
d[threadIdx.x] = mad_s16v2(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
*/
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_wide_s16_kernel(const s16* const RESTRICT a,
const s16* const RESTRICT b,
s32* const RESTRICT d)
{
d[threadIdx.x] = mul_wide_s16(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_wide_s16_kernel(const s16* const RESTRICT a,
const s16* const RESTRICT b,
const s32* const RESTRICT c,
s32* const RESTRICT d)
{
d[threadIdx.x] = mad_wide_s16(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_s32_s16_kernel(const s32* const RESTRICT a,
const s16* const RESTRICT b,
s32* const RESTRICT d)
{
d[threadIdx.x] = mul_s32_s16(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_s32_s16_kernel(const s32* const RESTRICT a,
const s16* const RESTRICT b,
const s32* const RESTRICT c,
s32* const RESTRICT d)
{
d[threadIdx.x] = mad_s32_s16(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_lo_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
s32* const RESTRICT d)
{
d[threadIdx.x] = mul_lo_s32(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_lo_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
const s32* const RESTRICT c,
s32* const RESTRICT d)
{
d[threadIdx.x] = mad_lo_s32(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_wide_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
s64* const RESTRICT d)
{
d[threadIdx.x] = mul_wide_s32(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_wide_s32_kernel(const s32* const RESTRICT a,
const s32* const RESTRICT b,
const s64* const RESTRICT c,
s64* const RESTRICT d)
{
d[threadIdx.x] = mad_wide_s32(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
KERNEL_QUALIFIERS_EXTERN_C
void
mul_s64_kernel(const s64* const RESTRICT a,
const s64* const RESTRICT b,
s64* const RESTRICT d)
{
d[threadIdx.x] = mul_s64(a[threadIdx.x],
b[threadIdx.x]);
}
KERNEL_QUALIFIERS_EXTERN_C
void
mad_s64_kernel(const s64* const RESTRICT a,
const s64* const RESTRICT b,
const s64* const RESTRICT c,
s64* const RESTRICT d)
{
d[threadIdx.x] = mad_s64(a[threadIdx.x],
b[threadIdx.x],
c[threadIdx.x]);
}
//
//
//
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment