Skip to content

Instantly share code, notes, and snippets.

@funnbot
Last active August 7, 2023 23:49
Show Gist options
  • Save funnbot/e807f0d9680dd5fa38648caa030f718e to your computer and use it in GitHub Desktop.
Save funnbot/e807f0d9680dd5fa38648caa030f718e to your computer and use it in GitHub Desktop.
/*
// From cuda docs:
__device__ int __dp4a ( int srcA, int srcB, int c )
Four-way signedint8 dot product with int32 accumulate.
Description
Extracts four pairs of packed byte-sized integers from scrA and srcB, then creates four pairwise products and adds them together to a signed 32-bit integer c.
*/
static __device__ __forceinline__ int32_t __dp4a(int32_t srcA, int32_t srcB, int32_t c) {
return amd_mixed_dot(srcA, srcB, c, /*saturate=*/ true);
}
// https://github.com/intel/llvm/blob/8e0cc4b7a845df9389a1313a3e680babc4d87782/sycl/source/detail/builtins_integer.cpp#L218
static __device__ __forceinline__ int8_t s_sub_sat(int8_t x, int8_t y) {
int8_t result = uint8_t(x) - uint8_t(y);
// Saturate result if (+) - (-) = (-) or (-) - (+) = (+).
if (((x < 0) ^ (y < 0)) && ((x < 0) ^ (result < 0))) {
result = result < 0 ? std::numeric_limits<int8_t>::max() : std::numeric_limits<int8_t>::min();
}
return result;
}
/*
// From cuda docs:
__device__ unsigned int __vsubss4 ( unsigned int a, unsigned int b )
Performs per-byte subtraction with signed saturation.
Returns
Returns computed value.
Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts function performs subtraction with signed saturation. Partial results are recombined and returned as unsigned int.
*/
static __device__ __forceinline__ uint32_t __vsubss4(uint32_t a, uint32_t b) {
// Is this more correct? (Didn't fix mmq)
uint32_t r;
__asm__("v_sub_nc_i32 %0,%1,%2,clamp;"
: "=v"(r)
: "r"(a), "r"(b));
return r;
// Suggested instruction (Didn't fix mmq)
// uint32_t r;
// __asm__("v_sub_nc_u32 %0,%1,%2;"
// : "=v"(r)
// : "r"(a), "r"(b));
// return r;
/// Proper hip types (Didn't fix mmq)
// char4 a4 = mapFrom<char4, uint32_t>(a);
// char4 b4 = mapFrom<char4, uint32_t>(b);
// char4 c = char4{
// s_sub_sat(a4.x, b4.x),
// s_sub_sat(a4.y, b4.y),
// s_sub_sat(a4.z, b4.z),
// s_sub_sat(a4.w, b4.w),
// };
// return mapFrom<uint32_t, char4>(c);
/// Bitwise (Didn't fix mmq)
// return (
// (s_sub_sat(a, b) & 0xFF) |
// ((s_sub_sat(a >> 8, b >> 8) << 8) & 0xFF00) |
// ((s_sub_sat(a >> 16, b >> 16) << 16) & 0xFF0000) |
// ((s_sub_sat(a >> 24, b >> 24) << 24) & 0xFF000000)
// );
/// From github comment (Didn't fix mmq)
// typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
// int8x4_t ap = int8x4_t(a);
// int8x4_t bp = int8x4_t(b);
// int8x4_t c = {
// s_sub_sat(ap.x, bp.x),
// s_sub_sat(ap.y, bp.y),
// s_sub_sat(ap.z, bp.z),
// s_sub_sat(ap.w, bp.w) };
// return *(unsigned int*)(&c);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment