Created
December 25, 2019 12:52
-
-
Save mntone/acb7e801702b17eca115c3beed72a1c2 to your computer and use it in GitHub Desktop.
MMXで64-bit処理のためのuint16x4_t作ろうとしたけど命令数足りなさすぎて実用的じゃないので,SSE 128-bit最小単位にするように設計変更。ご自由にどうぞ。 >> under MIT License <<
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef _SIMD_UINT16X4_H
#define _SIMD_UINT16X4_H

#include "simd_base.h"

#ifdef __cplusplus
extern "C" {
#endif

// ---
// x86/x86-64
// ---
#ifdef _SIMD_X86
#ifdef _SIMD_X86_AVX
// NOTE: no trailing semicolon -- the expansion must remain usable as an
// expression (the original macros ended with `;`, which breaks callers
// that use the result in an expression context).
#define _simd_mm_permute_ps(__a, __m) _mm_permute_ps(__a, __m)
#else
#define _simd_mm_permute_ps(__a, __m) _mm_shuffle_ps(__a, __a, __m)
#endif
// Bitwise NOT of a 64-bit MMX vector: returns ~__a.
// _mm_andnot_si64(x, y) computes (~x) & y, so AND-NOT against all-ones is ~x.
static inline __m64 _simd_mm_not_si64(__m64 __a) {
    // Use -1 (all bits set) rather than 0xFFFF: 0xFFFF does not fit in a
    // signed short, so its conversion is implementation-defined.
    return _mm_andnot_si64(__a, _mm_set1_pi16(-1));
}
// Bitwise select: for each bit, take __b where __m is set, __a where clear.
static inline __m64 _simd_mm_sel_si64(__m64 __a, __m64 __b, __m64 __m) {
    __m64 keep = _mm_andnot_si64(__m, __a); // __a & ~__m
    __m64 take = _mm_and_si64(__m, __b);    // __b &  __m
    return _mm_or_si64(keep, take);
}
#endif | |
// ---
// Constants
// ---
// clang-format off
#define UINT16X4_ZERO uint16x4_inits(0)
#define UINT16X4_ONE  uint16x4_inits(1)
// Lanes are unsigned 16-bit: the valid range is [0, UINT16_MAX].
// (The original used INT16_MIN/INT16_MAX, the *signed* range.)
#define UINT16X4_MIN  uint16x4_inits(0)
#define UINT16X4_MAX  uint16x4_inits(UINT16_MAX)
// clang-format on
// --- | |
// Inits | |
// --- | |
static inline uint16x4_t uint16x4_t_initv(uint16_t s0, | |
uint16_t s1, | |
uint16_t s2, | |
uint16_t s3) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = { s0, s1, s2, s3 }; | |
#elif defined(_SIMD_X86_MMX) | |
ret = _mm_setr_pi16(s0, s1, s2, s3); | |
#else | |
ret.u16[0] = s0; | |
ret.u16[1] = s1; | |
ret.u16[2] = s2; | |
ret.u16[3] = s3; | |
#endif | |
return ret; | |
} | |
static inline uint16x4_t uint16x4_inits(uint16_t s) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = vdup_n_u16(s); | |
#elif defined(_SIMD_X86_MMX) | |
ret = _mm_set1_pi16(s); | |
#else | |
for (size_t i = 0; i < 4; ++i) { | |
ret.u16[i] = s; | |
} | |
#endif | |
return ret; | |
} | |
static inline uint16x4_t uint16x4_inita(const float a[]) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = vld1_u16(a); | |
#elif defined(_SIMD_X86_MMX) | |
ret = _mm_setr_pi16(a[0], a[1], a[2], a[3]); | |
#else | |
for (size_t i = 0; i < 4; ++i) { | |
ret.u16[i] = a[i]; | |
} | |
#endif | |
return ret; | |
} | |
static inline uint16x4_t uint16x4_initp(const uint16_t *p) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = vld1_dup_u16(p); | |
#elif defined(_SIMD_X86_MMX) | |
ret = _mm_set1_pi16(*p); | |
#else | |
float s = *p; | |
for (size_t i = 0; i < 4; ++i) { | |
ret.u16[i] = s; | |
} | |
#endif | |
return ret; | |
} | |
// ---
// Gets
// ---
#if defined(_SIMD_ARM_NEON)
#define uint16x4_getat(__v, __idx) vget_lane_u16(__v, __idx)
#define uint16x4_getatp(__v, __idx, __p) vst1_lane_u16(__p, __v, __idx)
#elif defined(_SIMD_X86_SSE2)
// _mm_extract_pi16 takes (vector, index); the original wrote
// `_mm_extract_pi16((__v, __idx)` -- unbalanced parentheses which, even if
// closed by the caller, would collapse to the comma operator.
#define uint16x4_getat(__v, __idx) ((uint16_t)_mm_extract_pi16(__v, __idx))
#define uint16x4_getatp(__v, __idx, __p) (*(__p) = (uint16_t)_mm_extract_pi16(__v, __idx))
#else
#define uint16x4_getat(__v, __idx) ((__v).u16[__idx])
#define uint16x4_getatp(__v, __idx, __p) (*(__p) = (__v).u16[__idx])
#endif
// --- | |
// Sets | |
// --- | |
/*static inline uint16x4_t uint16x4_setat(uint16x8_t v, int index, uint16_t value) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = vset_lane_u16(value, v, index); | |
#elif defined(_SIMD_X86_SSE) | |
ret = _mm_insert_pi16(v, value, index); | |
#else | |
ret.u16[index] = value; | |
#endif | |
return ret; | |
}*/ | |
// ---
// Logic
// ---
// ~a (bitwise complement of every lane)
static inline uint16x4_t uint16x4_not(uint16x4_t a) {
#if defined(_SIMD_ARM_NEON)
    return vmvn_u16(a);
#elif defined(_SIMD_X86_MMX)
    return _simd_mm_not_si64(a);
#else
    return __simd64_not(a);
#endif
}
// a & b (bitwise AND of every lane)
static inline uint16x4_t uint16x4_and(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
    return vand_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    return _mm_and_si64(a, b);
#else
    return __simd64_and(a, b);
#endif
}
// ~a & b (NEON vbic computes first & ~second, hence the swapped operands)
static inline uint16x4_t uint16x4_andnot(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
    return vbic_u16(b, a);
#elif defined(_SIMD_X86_MMX)
    return _mm_andnot_si64(a, b);
#else
    return __simd64_andnot(a, b);
#endif
}
// a | b (bitwise OR of every lane)
static inline uint16x4_t uint16x4_or(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
    return vorr_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    return _mm_or_si64(a, b);
#else
    return __simd64_or(a, b);
#endif
}
// ~a | b (NEON vorn computes first | ~second, hence the swapped operands)
static inline uint16x4_t uint16x4_ornot(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
    return vorn_u16(b, a);
#elif defined(_SIMD_X86_MMX)
    return _mm_or_si64(_simd_mm_not_si64(a), b);
#else
    return __simd64_ornot(a, b);
#endif
}
// a ^ b (bitwise XOR of every lane)
static inline uint16x4_t uint16x4_xor(uint16x4_t a, uint16x4_t b) {
#if defined(_SIMD_ARM_NEON)
    return veor_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    return _mm_xor_si64(a, b);
#else
    return __simd64_xor(a, b);
#endif
}
// (a & ~m) | (b & m) -- per-bit select: take b where m is set, a where clear.
static inline uint16x4_t uint16x4_sel(uint16x4_t a, uint16x4_t b, uint16x4_t m) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    // vbsl(mask, x, y) picks x where mask bits are set and y where clear,
    // so b must be the second argument to match the (a & ~m) | (b & m)
    // contract. The original passed (m, a, b), which selected a where m
    // was set -- the opposite of the MMX and scalar paths.
    ret = vbsl_u16(m, b, a);
#elif defined(_SIMD_X86_MMX)
    ret = _simd_mm_sel_si64(a, b, m);
#else
    ret = __simd64_sel(a, b, m);
#endif
    return ret;
}
// a >> n (logical right shift of every lane)
static inline uint16x4_t uint16x4_shr(uint16x4_t a, int n) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    ret = vshr_n_u16(a, n);
#elif defined(_SIMD_X86_MMX)
    // The original called _mm_slli_pi16 here -- a LEFT shift, apparently
    // copied from uint16x4_shl. The logical right-shift intrinsic is
    // _mm_srli_pi16.
    ret = _mm_srli_pi16(a, n);
#else
    for (size_t i = 0; i < 4; ++i) {
        ret.u16[i] = a.u16[i] >> n;
    }
#endif
    return ret;
}
// a << n (left shift of every lane)
static inline uint16x4_t uint16x4_shl(uint16x4_t a, int n) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    ret = vshl_n_u16(a, n);
#elif defined(_SIMD_X86_MMX)
    ret = _mm_slli_pi16(a, n);
#else
    size_t lane;
    for (lane = 0; lane < 4; ++lane) {
        ret.u16[lane] = a.u16[lane] << n;
    }
#endif
    return ret;
}
// ---
// Arithmetic
// ---
// a + b (wrapping per-lane addition)
static inline uint16x4_t uint16x4_add(uint16x4_t a, uint16x4_t b) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    ret = vadd_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    // _mm_add_pi16 (PADDW) is a base MMX intrinsic; guard on
    // _SIMD_X86_MMX like the rest of this file. The original tested
    // _SIMD_X86_SSE, so an MMX-only build fell into the scalar path,
    // which cannot compile against an __m64-typed uint16x4_t.
    ret = _mm_add_pi16(a, b);
#else
    for (size_t i = 0; i < 4; ++i) {
        ret.u16[i] = (uint16_t)(a.u16[i] + b.u16[i]);
    }
#endif
    return ret;
}
// saturate(a + b) (per-lane unsigned saturating addition)
static inline uint16x4_t uint16x4_adds(uint16x4_t a, uint16x4_t b) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    ret = vqadd_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    // _mm_adds_pu16 (PADDUSW) is a base MMX intrinsic; guard on
    // _SIMD_X86_MMX for consistency with the rest of this file (the
    // original tested _SIMD_X86_SSE).
    ret = _mm_adds_pu16(a, b);
#else
    for (size_t i = 0; i < 4; ++i) {
        uint16_t ai = a.u16[i];
        uint16_t bi = b.u16[i];
        // Clamp to UINT16_MAX instead of wrapping.
        ret.u16[i] = (ai > UINT16_MAX - bi) ? UINT16_MAX : (uint16_t)(ai + bi);
    }
#endif
    return ret;
}
// a - b (wrapping per-lane subtraction)
static inline uint16x4_t uint16x4_sub(uint16x4_t a, uint16x4_t b) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    ret = vsub_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    // _mm_sub_pi16 (PSUBW) is a base MMX intrinsic; guard on
    // _SIMD_X86_MMX like the rest of this file (original used
    // _SIMD_X86_SSE).
    ret = _mm_sub_pi16(a, b);
#else
    // 4 lanes, not 8 -- the original loop ran to 8 and wrote out of
    // bounds (copied from the uint16x8 variant of this file).
    for (size_t i = 0; i < 4; ++i) {
        ret.u16[i] = (uint16_t)(a.u16[i] - b.u16[i]);
    }
#endif
    return ret;
}
// saturate(a - b) (per-lane unsigned saturating subtraction, floors at 0)
static inline uint16x4_t uint16x4_subs(uint16x4_t a, uint16x4_t b) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    ret = vqsub_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    // _mm_subs_pu16 (PSUBUSW) is a base MMX intrinsic; guard on
    // _SIMD_X86_MMX like the rest of this file (original used
    // _SIMD_X86_SSE).
    ret = _mm_subs_pu16(a, b);
#else
    // 4 lanes, not 8 -- the original loop ran to 8 and wrote out of
    // bounds (copied from the uint16x8 variant of this file).
    for (size_t i = 0; i < 4; ++i) {
        uint16_t ai = a.u16[i];
        uint16_t bi = b.u16[i];
        ret.u16[i] = ai < bi ? 0 : (uint16_t)(ai - bi);
    }
#endif
    return ret;
}
// ---
// Processing
// ---
// a0, b0, a1, b1 (interleave the low halves of a and b)
static inline uint16x4_t uint16x4_unpack_lo(uint16x4_t a, uint16x4_t b) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    // vzip_u16, not vzip_f32: these are uint16 lanes (the f32 call was a
    // leftover from the float32x4 variant of this file and does not
    // type-check).
    ret = vzip_u16(a, b).val[0];
#elif defined(_SIMD_X86_MMX)
    ret = _mm_unpacklo_pi16(a, b);
#else
    for (size_t i = 0; i < 2; ++i) {
        ret.u16[2 * i] = a.u16[i];
        ret.u16[2 * i + 1] = b.u16[i];
    }
#endif
    return ret;
}
// a2, b2, a3, b3 (interleave the high halves of a and b)
static inline uint16x4_t uint16x4_unpack_hi(uint16x4_t a, uint16x4_t b) {
    uint16x4_t ret;
#if defined(_SIMD_ARM_NEON)
    // vzip_u16, not vzip_f32: these are uint16 lanes (the f32 call was a
    // leftover from the float32x4 variant of this file and does not
    // type-check).
    ret = vzip_u16(a, b).val[1];
#elif defined(_SIMD_X86_MMX)
    ret = _mm_unpackhi_pi16(a, b);
#else
    for (size_t i = 0; i < 2; ++i) {
        ret.u16[2 * i] = a.u16[2 + i];
        ret.u16[2 * i + 1] = b.u16[2 + i];
    }
#endif
    return ret;
}
// a0, b0, a1, b1 | a2, b2, a3, b3 (full interleave into a vector pair)
static inline uint16x4x2_t uint16x4_unpack(uint16x4_t a, uint16x4_t b) {
    uint16x4x2_t ret;
#if defined(_SIMD_ARM_NEON)
    // vzip_u16 on the 64-bit vectors themselves. The original called
    // vzipq_f32(a.v, b.v) -- the 128-bit float variant, on `.v` members
    // these types do not have.
    ret = vzip_u16(a, b);
#elif defined(_SIMD_X86_MMX)
    ret.val[0] = _mm_unpacklo_pi16(a, b);
    ret.val[1] = _mm_unpackhi_pi16(a, b);
#else
    for (size_t i = 0; i < 2; ++i) {
        ret.val[0].u16[2 * i] = a.u16[i];
        ret.val[0].u16[2 * i + 1] = b.u16[i];
        ret.val[1].u16[2 * i] = a.u16[2 + i];
        ret.val[1].u16[2 * i + 1] = b.u16[2 + i];
    }
#endif
    return ret;
}
// --- | |
// Compare | |
// --- | |
// [vector] min(a, b) | |
/*static inline uint16x4_t uint16x4_min(uint16x4_t a, uint16x4_t b) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = vmin_u16(a, b); | |
#elif defined(_SIMD_X86_SSE) | |
ret = _mm_min_pu16(a, a, _simd_mm_cmpge_pu8(b, a)); | |
#else | |
for (size_t i = 0; i < 4; ++i) { | |
uint16_t ai = a.u16[i]; | |
uint16_t bi = b.u16[i]; | |
ret.u16[i] = ai < bi ? ai : bi; | |
} | |
#endif | |
return ret; | |
} | |
// [scalar] min(a, s) | |
static inline uint16x4_t uint16x4_mins(uint16x4_t a, uint16_t s) { | |
return uint16x4_min(a, uint16x4_inits(s)); | |
} | |
// [vector] max(a, b) | |
static inline uint16x4_t uint16x4_max(uint16x4_t a, uint16x4_t b) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = vmax_u16(a, b); | |
#elif defined(_SIMD_X86_SSE) | |
ret = _mm_max_pu16(a, b); | |
#else | |
for (size_t i = 0; i < 4; ++i) { | |
uint16_t ai = a.u16[i]; | |
uint16_t bi = b.u16[i]; | |
ret.u16[i] = ai < bi ? bi : ai; | |
} | |
#endif | |
return ret; | |
} | |
// [scalar] max(a, s) | |
static inline uint16x4_t uint16x4_maxs(uint16x4_t a, uint16_t s) { | |
return uint16x4_max(a, uint16x4_inits(s)); | |
} | |
// [vector] clamp(a, min, max) | |
static inline uint16x4_t uint16x4_clamp(uint16x4_t a, uint16x4_t min, uint16x4_t max) { | |
uint16x4_t ret; | |
#if defined(_SIMD_ARM_NEON) | |
ret = vmin_u16(vmax_u16(a, min), max); | |
#elif defined(_SIMD_X86_SSE) | |
ret = _mm_min_pu16(_mm_max_pu16(a, min), max); | |
#else | |
for (size_t i = 0; i < 4; ++i) { | |
uint16_t ai = a.u16[i]; | |
uint16_t mini = min.u16[i]; | |
uint16_t maxi = max.u16[i]; | |
ret.u16[i] = ai < mini ? mini : (ai < maxi ? ai : maxi); | |
} | |
#endif | |
return ret; | |
} | |
// [scalar] clamp(a, min, max) | |
static inline uint16x4_t uint16x4_clamps(uint16x4_t a, uint16_t min, uint16_t max) { | |
return uint16x4_clamp(a, uint16x4_inits(min), uint16x4_inits(max)); | |
}*/ | |
#ifdef __cplusplus | |
} | |
#endif | |
#endif // _SIMD_UINT16X4_H |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment