Skip to content

Instantly share code, notes, and snippets.

@mntone
Created Dec 25, 2019
Embed
What would you like to do?
MMXで64-bit処理のためのuint16x4_t作ろうとしたけど命令数足りなさすぎて実用的じゃないので,SSE 128-bit最小単位にするように設計変更。ご自由にどうぞ。 >> under MIT License <<
#ifndef _SIMD_UINT16X4_H
#define _SIMD_UINT16X4_H
#include "simd_base.h"
#ifdef __cplusplus
extern "C" {
#endif
// ---
// x86/x86-64
// ---
#ifdef _SIMD_X86
// Single-source 4x32-bit permute. AVX has a dedicated _mm_permute_ps; pre-AVX
// emulates it with _mm_shuffle_ps using the same register for both sources.
// Fixed: the definitions ended with a semicolon, which made the macro unusable
// inside expressions (e.g. as a function argument). Arguments are now
// parenthesized per macro hygiene; __m must still be a compile-time constant.
#ifdef _SIMD_X86_AVX
#define _simd_mm_permute_ps(__a, __m) _mm_permute_ps((__a), (__m))
#else
#define _simd_mm_permute_ps(__a, __m) _mm_shuffle_ps((__a), (__a), (__m))
#endif
// Bitwise complement of all 64 bits: ~a == andnot(a, all-ones).
// Fixed: the all-ones mask was built with _mm_set1_pi16(0xFFFF); 0xFFFF does
// not fit in the intrinsic's signed short parameter (implementation-defined
// conversion). -1 expresses the all-ones lane exactly.
static inline __m64 _simd_mm_not_si64(__m64 __a) {
	return _mm_andnot_si64(__a, _mm_set1_pi16(-1));
}
// Bitwise select: for each bit, take __b where the mask bit in __m is set,
// otherwise take __a. Result = (__a & ~__m) | (__b & __m).
static inline __m64 _simd_mm_sel_si64(__m64 __a, __m64 __b, __m64 __m) {
	const __m64 __from_a = _mm_andnot_si64(__m, __a);
	const __m64 __from_b = _mm_and_si64(__b, __m);
	return _mm_or_si64(__from_a, __from_b);
}
#endif
// ---
// Constants
// ---
// clang-format off
#define UINT16X4_ZERO uint16x4_inits(0)
#define UINT16X4_ONE uint16x4_inits(1)
/* Fixed: the limits used INT16_MIN/INT16_MAX, the *signed* 16-bit range.
 * For an unsigned lane type the minimum is 0 and the maximum is UINT16_MAX;
 * INT16_MIN would have wrapped to 0x8000. */
#define UINT16X4_MIN uint16x4_inits(0)
#define UINT16X4_MAX uint16x4_inits(UINT16_MAX)
// clang-format on
// ---
// Inits
// ---
// Build a vector from four explicit lane values (s0 = lane 0 ... s3 = lane 3).
// NOTE(review): the name keeps the historical "uint16x4_t_initv" spelling so
// existing callers keep compiling; the family convention would be
// "uint16x4_initv".
static inline uint16x4_t uint16x4_t_initv(uint16_t s0,
                                          uint16_t s1,
                                          uint16_t s2,
                                          uint16_t s3) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	// Fixed: `ret = { s0, s1, s2, s3 };` is not valid C — a brace-enclosed
	// initializer may only appear in a declaration. Load from a temporary
	// array instead.
	const uint16_t tmp[4] = { s0, s1, s2, s3 };
	ret = vld1_u16(tmp);
#elif defined(_SIMD_X86_MMX)
	ret = _mm_setr_pi16(s0, s1, s2, s3);
#else
	ret.u16[0] = s0;
	ret.u16[1] = s1;
	ret.u16[2] = s2;
	ret.u16[3] = s3;
#endif
	return ret;
}
// Broadcast a single scalar value into every lane.
static inline uint16x4_t uint16x4_inits(uint16_t s) {
#if defined(_SIMD_ARM_NEON)
	return vdup_n_u16(s);
#elif defined(_SIMD_X86_MMX)
	return _mm_set1_pi16(s);
#else
	uint16x4_t ret;
	ret.u16[0] = ret.u16[1] = ret.u16[2] = ret.u16[3] = s;
	return ret;
#endif
}
// Load four lanes from an array of at least four uint16_t values.
// Fixed: the parameter was declared `const float a[]` (apparent copy/paste
// from a float32x4 variant) — vld1_u16 requires a uint16_t pointer, and the
// scalar path silently converted through float.
static inline uint16x4_t uint16x4_inita(const uint16_t a[]) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	ret = vld1_u16(a);
#elif defined(_SIMD_X86_MMX)
	ret = _mm_setr_pi16(a[0], a[1], a[2], a[3]);
#else
	for (size_t i = 0; i < 4; ++i) {
		ret.u16[i] = a[i];
	}
#endif
	return ret;
}
// Broadcast the value pointed to by p into every lane.
static inline uint16x4_t uint16x4_initp(const uint16_t *p) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	ret = vld1_dup_u16(p);
#elif defined(_SIMD_X86_MMX)
	ret = _mm_set1_pi16(*p);
#else
	// Fixed: the temporary was declared `float`, forcing a pointless
	// integer -> float -> integer round trip.
	const uint16_t s = *p;
	for (size_t i = 0; i < 4; ++i) {
		ret.u16[i] = s;
	}
#endif
	return ret;
}
// ---
// Gets
// ---
// Lane accessors. __idx must be a compile-time constant on the NEON and SSE
// paths; uint16x4_getatp stores lane __idx through pointer __p.
#if defined(_SIMD_ARM_NEON)
#define uint16x4_getat(__v, __idx) vget_lane_u16(__v, __idx)
#define uint16x4_getatp(__v, __idx, __p) vst1_lane_u16(__p, __v, __idx)
#elif defined(_SIMD_X86_SSE2)
// Fixed: "_mm_extract_pi16((__v, __idx)" had unbalanced parentheses and passed
// a comma expression as the single argument — it could never compile.
#define uint16x4_getat(__v, __idx) ((uint16_t)_mm_extract_pi16((__v), (__idx)))
#define uint16x4_getatp(__v, __idx, __p) (*(__p) = (uint16_t)_mm_extract_pi16((__v), (__idx)))
#else
#define uint16x4_getat(__v, __idx) ((__v).u16[(__idx)])
#define uint16x4_getatp(__v, __idx, __p) (*(__p) = (__v).u16[(__idx)])
#endif
// ---
// Sets
// ---
/*static inline uint16x4_t uint16x4_setat(uint16x8_t v, int index, uint16_t value) {
uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
ret = vset_lane_u16(value, v, index);
#elif defined(_SIMD_X86_SSE)
ret = _mm_insert_pi16(v, value, index);
#else
ret.u16[index] = value;
#endif
return ret;
}*/
// ---
// Logic
// ---
// ~a
// Lane-wise bitwise complement: ~a.
static inline uint16x4_t uint16x4_not(uint16x4_t a) {
#if defined(_SIMD_ARM_NEON)
	return vmvn_u16(a);
#elif defined(_SIMD_X86_MMX)
	return _simd_mm_not_si64(a);
#else
	return __simd64_not(a);
#endif
}
// a & b
// Bitwise conjunction: a & b.
static inline uint16x4_t uint16x4_and(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
	return vand_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	return _mm_and_si64(a, b);
#else
	return __simd64_and(a, b);
#endif
}
// ~a & b
// Bit clear: ~a & b (NEON's vbic takes the operand to clear *second*, hence
// the swapped argument order).
static inline uint16x4_t uint16x4_andnot(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
	return vbic_u16(b, a);
#elif defined(_SIMD_X86_MMX)
	return _mm_andnot_si64(a, b);
#else
	return __simd64_andnot(a, b);
#endif
}
// a | b
// Bitwise disjunction: a | b.
static inline uint16x4_t uint16x4_or(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
	return vorr_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	return _mm_or_si64(a, b);
#else
	return __simd64_or(a, b);
#endif
}
// ~a | b
// OR with complement: ~a | b (NEON's vorn negates its *second* operand, hence
// the swapped argument order).
static inline uint16x4_t uint16x4_ornot(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
	return vorn_u16(b, a);
#elif defined(_SIMD_X86_MMX)
	return _mm_or_si64(_simd_mm_not_si64(a), b);
#else
	return __simd64_ornot(a, b);
#endif
}
// a ^ b
// Exclusive or: a ^ b.
static inline uint16x4_t uint16x4_xor(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
	return veor_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	return _mm_xor_si64(a, b);
#else
	return __simd64_xor(a, b);
#endif
}
// (a & ~m) | (b & m)
// Bitwise select: (a & ~m) | (b & m) — take b where the mask bit is set,
// otherwise take a.
static inline uint16x4_t uint16x4_sel(uint16x4_t a, uint16x4_t b, uint16x4_t m) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	// Fixed: vbsl_u16(m, x, y) selects x where the mask bit is SET. The
	// previous vbsl_u16(m, a, b) therefore picked a, the opposite of the
	// MMX/scalar paths and of the documented contract; b must come second.
	ret = vbsl_u16(m, b, a);
#elif defined(_SIMD_X86_MMX)
	ret = _simd_mm_sel_si64(a, b, m);
#else
	ret = __simd64_sel(a, b, m);
#endif
	return ret;
}
// a >> n
// Logical shift right of each lane: a >> n.
static inline uint16x4_t uint16x4_shr(uint16x4_t a, int n) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	ret = vshr_n_u16(a, n);
#elif defined(_SIMD_X86_MMX)
	// Fixed: was _mm_slli_pi16 (shift LEFT) — copy/paste from uint16x4_shl.
	ret = _mm_srli_pi16(a, n);
#else
	for (size_t i = 0; i < 4; ++i) {
		ret.u16[i] = a.u16[i] >> n;
	}
#endif
	return ret;
}
// a << n
// Shift left of each lane: a << n.
static inline uint16x4_t uint16x4_shl(uint16x4_t a, int n) {
#if defined(_SIMD_ARM_NEON)
	return vshl_n_u16(a, n);
#elif defined(_SIMD_X86_MMX)
	return _mm_slli_pi16(a, n);
#else
	uint16x4_t ret;
	size_t lane = 0;
	while (lane < 4) {
		ret.u16[lane] = a.u16[lane] << n;
		++lane;
	}
	return ret;
#endif
}
// ---
// Arithmetic
// ---
// a + b
// Lane-wise wrapping addition: a + b (mod 2^16).
static inline uint16x4_t uint16x4_add(uint16x4_t a, uint16x4_t b) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	ret = vadd_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	// _mm_add_pi16 (PADDW) is a base-MMX instruction; guard on the MMX macro
	// like the rest of this file instead of needlessly requiring SSE.
	ret = _mm_add_pi16(a, b);
#else
	for (size_t i = 0; i < 4; ++i) {
		ret.u16[i] = a.u16[i] + b.u16[i];
	}
#endif
	return ret;
}
// saturate(a + b)
// Lane-wise saturating addition: min(a + b, UINT16_MAX).
static inline uint16x4_t uint16x4_adds(uint16x4_t a, uint16x4_t b) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	ret = vqadd_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	// _mm_adds_pu16 (PADDUSW) is a base-MMX instruction; guard on the MMX
	// macro for consistency with the rest of this file.
	ret = _mm_adds_pu16(a, b);
#else
	for (size_t i = 0; i < 4; ++i) {
		uint16_t ai = a.u16[i];
		uint16_t bi = b.u16[i];
		// Overflow test performed before the add so the check itself cannot wrap.
		ret.u16[i] = (ai > UINT16_MAX - bi) ? UINT16_MAX : ai + bi;
	}
#endif
	return ret;
}
// a - b
// Lane-wise wrapping subtraction: a - b (mod 2^16).
static inline uint16x4_t uint16x4_sub(uint16x4_t a, uint16x4_t b) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	ret = vsub_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	// _mm_sub_pi16 (PSUBW) is base MMX; guard relaxed from SSE for consistency.
	ret = _mm_sub_pi16(a, b);
#else
	// Fixed: the loop ran to 8 and wrote past the 4-lane array (undefined
	// behavior) — copy/paste from a uint16x8 variant.
	for (size_t i = 0; i < 4; ++i) {
		ret.u16[i] = a.u16[i] - b.u16[i];
	}
#endif
	return ret;
}
// saturate(a - b)
// Lane-wise saturating subtraction: max(a - b, 0).
static inline uint16x4_t uint16x4_subs(uint16x4_t a, uint16x4_t b) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	ret = vqsub_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	// _mm_subs_pu16 (PSUBUSW) is base MMX; guard relaxed from SSE for consistency.
	ret = _mm_subs_pu16(a, b);
#else
	// Fixed: the loop ran to 8 and wrote past the 4-lane array (undefined
	// behavior) — copy/paste from a uint16x8 variant.
	for (size_t i = 0; i < 4; ++i) {
		uint16_t ai = a.u16[i];
		uint16_t bi = b.u16[i];
		ret.u16[i] = ai < bi ? 0 : ai - bi;
	}
#endif
	return ret;
}
// ---
// Processing
// ---
// a0, b0, a1, b1
// Interleave the low halves of a and b: { a0, b0, a1, b1 }.
static inline uint16x4_t uint16x4_unpack_lo(uint16x4_t a, uint16x4_t b) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	// Fixed: was vzip_f32, which operates on float32x2_t — u16 lanes need
	// vzip_u16.
	ret = vzip_u16(a, b).val[0];
#elif defined(_SIMD_X86_MMX)
	ret = _mm_unpacklo_pi16(a, b);
#else
	for (size_t i = 0; i < 2; ++i) {
		ret.u16[2 * i] = a.u16[i];
		ret.u16[2 * i + 1] = b.u16[i];
	}
#endif
	return ret;
}
// a2, b2, a3, b3
// Interleave the high halves of a and b: { a2, b2, a3, b3 }.
static inline uint16x4_t uint16x4_unpack_hi(uint16x4_t a, uint16x4_t b) {
	uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
	// Fixed: was vzip_f32, which operates on float32x2_t — u16 lanes need
	// vzip_u16.
	ret = vzip_u16(a, b).val[1];
#elif defined(_SIMD_X86_MMX)
	ret = _mm_unpackhi_pi16(a, b);
#else
	for (size_t i = 0; i < 2; ++i) {
		ret.u16[2 * i] = a.u16[2 + i];
		ret.u16[2 * i + 1] = b.u16[2 + i];
	}
#endif
	return ret;
}
// a0, b0, a1, b1 | a2, b2, a3, b3
// Full interleave of a and b:
//   val[0] = { a0, b0, a1, b1 },  val[1] = { a2, b2, a3, b3 }.
static inline uint16x4x2_t uint16x4_unpack(uint16x4_t a, uint16x4_t b) {
	uint16x4x2_t ret;
#if defined(_SIMD_ARM_NEON)
	// Fixed: was vzipq_f32(a.v, b.v) — the 128-bit float variant, applied to a
	// ".v" member these 64-bit u16 vectors do not have. vzip_u16 returns the
	// matching uint16x4x2_t directly.
	ret = vzip_u16(a, b);
#elif defined(_SIMD_X86_MMX)
	ret.val[0] = _mm_unpacklo_pi16(a, b);
	ret.val[1] = _mm_unpackhi_pi16(a, b);
#else
	for (size_t i = 0; i < 2; ++i) {
		ret.val[0].u16[2 * i] = a.u16[i];
		ret.val[0].u16[2 * i + 1] = b.u16[i];
	}
	for (size_t i = 0; i < 2; ++i) {
		ret.val[1].u16[2 * i] = a.u16[2 + i];
		ret.val[1].u16[2 * i + 1] = b.u16[2 + i];
	}
#endif
	return ret;
}
// ---
// Compare
// ---
// [vector] min(a, b)
/*static inline uint16x4_t uint16x4_min(uint16x4_t a, uint16x4_t b) {
uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
ret = vmin_u16(a, b);
#elif defined(_SIMD_X86_SSE)
ret = _mm_min_pu16(a, a, _simd_mm_cmpge_pu8(b, a));
#else
for (size_t i = 0; i < 4; ++i) {
uint16_t ai = a.u16[i];
uint16_t bi = b.u16[i];
ret.u16[i] = ai < bi ? ai : bi;
}
#endif
return ret;
}
// [scalar] min(a, s)
static inline uint16x4_t uint16x4_mins(uint16x4_t a, uint16_t s) {
return uint16x4_min(a, uint16x4_inits(s));
}
// [vector] max(a, b)
static inline uint16x4_t uint16x4_max(uint16x4_t a, uint16x4_t b) {
uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
ret = vmax_u16(a, b);
#elif defined(_SIMD_X86_SSE)
ret = _mm_max_pu16(a, b);
#else
for (size_t i = 0; i < 4; ++i) {
uint16_t ai = a.u16[i];
uint16_t bi = b.u16[i];
ret.u16[i] = ai < bi ? bi : ai;
}
#endif
return ret;
}
// [scalar] max(a, s)
static inline uint16x4_t uint16x4_maxs(uint16x4_t a, uint16_t s) {
return uint16x4_max(a, uint16x4_inits(s));
}
// [vector] clamp(a, min, max)
static inline uint16x4_t uint16x4_clamp(uint16x4_t a, uint16x4_t min, uint16x4_t max) {
uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
ret = vmin_u16(vmax_u16(a, min), max);
#elif defined(_SIMD_X86_SSE)
ret = _mm_min_pu16(_mm_max_pu16(a, min), max);
#else
for (size_t i = 0; i < 4; ++i) {
uint16_t ai = a.u16[i];
uint16_t mini = min.u16[i];
uint16_t maxi = max.u16[i];
ret.u16[i] = ai < mini ? mini : (ai < maxi ? ai : maxi);
}
#endif
return ret;
}
// [scalar] clamp(a, min, max)
static inline uint16x4_t uint16x4_clamps(uint16x4_t a, uint16_t min, uint16_t max) {
return uint16x4_clamp(a, uint16x4_inits(min), uint16x4_inits(max));
}*/
#ifdef __cplusplus
}
#endif
#endif // _SIMD_UINT16X4_H
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment