@danieloneill
Created November 7, 2019 05:51
Patches for OBS Studio: ARM/AArch64 build support for Linux, replacing x86 SSE intrinsics with NEON via an sse2neon.h shim.
diff --git a/deps/media-playback/CMakeLists.txt b/deps/media-playback/CMakeLists.txt
index d58d121..ae938fa 100644
--- a/deps/media-playback/CMakeLists.txt
+++ b/deps/media-playback/CMakeLists.txt
@@ -28,6 +28,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le")
PUBLIC
-mvsx)
add_compile_definitions(NO_WARN_X86_INTRINSICS)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch).*")
elseif(NOT MSVC)
target_compile_options(media-playback
PUBLIC
diff --git a/libobs-opengl/CMakeLists.txt b/libobs-opengl/CMakeLists.txt
index a116c99..645bb12 100644
--- a/libobs-opengl/CMakeLists.txt
+++ b/libobs-opengl/CMakeLists.txt
@@ -43,6 +43,7 @@ else() #This needs to change to be more specific to get ready for Wayland
set(libobs-opengl_PLATFORM_DEPS
${XCB_LIBRARIES}
+ ${OPENGL_LIBRARIES}
${X11_XCB_LIBRARIES})
set(libobs-opengl_PLATFORM_SOURCES
diff --git a/libobs/CMakeLists.txt b/libobs/CMakeLists.txt
index 8681148..b439847 100644
--- a/libobs/CMakeLists.txt
+++ b/libobs/CMakeLists.txt
@@ -13,6 +13,7 @@ endif()
if(UNIX)
if (NOT APPLE)
+ find_package(X11 REQUIRED)
find_package(X11_XCB REQUIRED)
find_package(XCB OPTIONAL_COMPONENTS XINPUT)
if (XCB_XINPUT_FOUND)
@@ -204,6 +205,7 @@ elseif(UNIX)
${X11_XCB_DEFINITIONS})
set(libobs_PLATFORM_DEPS
${libobs_PLATFORM_DEPS}
+ ${X11_LIBRARIES}
${X11_XCB_LIBRARIES})
if(USE_XINPUT)
@@ -470,6 +472,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le")
PUBLIC
-mvsx)
add_compile_definitions(NO_WARN_X86_INTRINSICS)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch).*")
elseif(NOT MSVC)
target_compile_options(libobs
PUBLIC
diff --git a/libobs/graphics/quat.h b/libobs/graphics/quat.h
index 23552ca..0610eb1 100644
--- a/libobs/graphics/quat.h
+++ b/libobs/graphics/quat.h
@@ -20,7 +20,12 @@
#include "../util/c99defs.h"
#include "math-defs.h"
#include "vec3.h"
-#include <xmmintrin.h>
+#if defined(__aarch64__) || defined(__arm__)
+# include <arm_neon.h>
+# include "sse2neon.h"
+#else
+# include <xmmintrin.h>
+#endif
/*
* Quaternion math
diff --git a/libobs/graphics/vec3.h b/libobs/graphics/vec3.h
index d6b9f8d..4ea72d5 100644
--- a/libobs/graphics/vec3.h
+++ b/libobs/graphics/vec3.h
@@ -19,7 +19,12 @@
#include "math-defs.h"
#include "vec4.h"
-#include <xmmintrin.h>
+#if defined(__aarch64__) || defined(__arm__)
+# include <arm_neon.h>
+# include "sse2neon.h"
+#else
+# include <xmmintrin.h>
+#endif
#ifdef __cplusplus
extern "C" {
diff --git a/libobs/graphics/vec4.h b/libobs/graphics/vec4.h
index 61143cc..c66a0a5 100644
--- a/libobs/graphics/vec4.h
+++ b/libobs/graphics/vec4.h
@@ -18,7 +18,12 @@
#pragma once
#include "math-defs.h"
-#include <xmmintrin.h>
+#if defined(__aarch64__) || defined(__arm__)
+# include <arm_neon.h>
+# include "sse2neon.h"
+#else
+# include <xmmintrin.h>
+#endif
#ifdef __cplusplus
extern "C" {
@@ -76,7 +81,11 @@ static inline void vec4_mul(struct vec4 *dst, const struct vec4 *v1,
static inline void vec4_div(struct vec4 *dst, const struct vec4 *v1,
const struct vec4 *v2)
{
+#if defined(__aarch64__) || defined(__arm__)
+ dst->m = vdivq_f32(v1->m, v2->m);
+#else
dst->m = _mm_div_ps(v1->m, v2->m);
+#endif
}
static inline void vec4_addf(struct vec4 *dst, const struct vec4 *v, float f)
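Note on the vec4_div hunk above: vdivq_f32 is an AArch64-only intrinsic, while the guard also matches 32-bit ARM (__arm__), where this would not compile. A 32-bit build would need a reciprocal-estimate fallback instead; a rough sketch (mirroring the _mm_div_ps emulation in sse2neon.h below):

#if defined(__aarch64__)
	dst->m = vdivq_f32(v1->m, v2->m);        /* exact hardware divide */
#elif defined(__arm__)
	/* ARMv7 NEON has no vector divide; refine a reciprocal estimate */
	float32x4_t r = vrecpeq_f32(v2->m);
	r = vmulq_f32(vrecpsq_f32(v2->m, r), r); /* one Newton-Raphson step */
	dst->m = vmulq_f32(v1->m, r);
#else
	dst->m = _mm_div_ps(v1->m, v2->m);
#endif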
diff --git a/libobs/media-io/format-conversion.c b/libobs/media-io/format-conversion.c
index f054064..8233698 100644
--- a/libobs/media-io/format-conversion.c
+++ b/libobs/media-io/format-conversion.c
@@ -16,8 +16,12 @@
******************************************************************************/
#include "format-conversion.h"
-#include <xmmintrin.h>
-#include <emmintrin.h>
+#if defined(__aarch64__) || defined(__arm__)
+# include "sse2neon.h"
+#else
+# include <xmmintrin.h>
+# include <emmintrin.h>
+#endif
/* ...surprisingly, if I don't use a macro to force inlining, it causes the
* CPU usage to boost by a tremendous amount in debug builds. */
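(The force-inline macro pattern that comment refers to generally looks like the sketch below; the actual macro in format-conversion.c may be named differently.)

#ifdef _MSC_VER
#define FORCE_INLINE __forceinline
#else
#define FORCE_INLINE inline __attribute__((always_inline))
#endif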
diff --git a/libobs/obs-audio-controls.c b/libobs/obs-audio-controls.c
index 81803ad..9106270 100644
--- a/libobs/obs-audio-controls.c
+++ b/libobs/obs-audio-controls.c
@@ -16,7 +16,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <math.h>
-#include <xmmintrin.h>
+#if defined(__aarch64__) || defined(__arm__)
+# include "sse2neon.h"
+#else
+# include <xmmintrin.h>
+# include <emmintrin.h>
+#endif
#include "util/threading.h"
#include "util/bmem.h"
diff --git a/plugins/decklink/audio-repack.c b/plugins/decklink/audio-repack.c
index 8c59af5..c644969 100644
--- a/plugins/decklink/audio-repack.c
+++ b/plugins/decklink/audio-repack.c
@@ -1,6 +1,10 @@
#include "audio-repack.h"
-#include <emmintrin.h>
+#if defined(__aarch64__) || defined(__arm__)
+# include "sse2neon.h"
+#else
+# include <emmintrin.h>
+#endif
int check_buffer(struct audio_repack *repack, uint32_t frame_count)
{
// =====================================================================================
//
// Filename: sse2neon.h
//
// Description:
//
// Version: 1.0
// Created: 2016-10-14 21:58:08
// Revision: none
// Compiler: g++
//
// Author: Su Junjie (USTC), jjsu@email.ustc.edu.cn
// Company:
//
// =====================================================================================
#ifndef SSE2NEON_H
#define SSE2NEON_H
//#define ENABLE_CPP_VERSION 0
#define GCC 1
#if GCC
#define INLINE inline __attribute__((always_inline))
#else
#define INLINE inline
#endif
#include <arm_neon.h>
typedef float32x4_t __m128;
typedef float32x2_t __m64;
typedef int32x4_t __m128i;
/* expands to the following value */
#define _MM_SHUFFLE(z, y, x, w) ( (z<<6) | (y<<4) | (x<<2) | w )
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
{ \
__m128 tmp3, tmp2, tmp1, tmp0; \
tmp0 = _mm_unpacklo_ps(row0, row1); \
tmp2 = _mm_unpacklo_ps(row2, row3); \
tmp1 = _mm_unpackhi_ps(row0, row1); \
tmp3 = _mm_unpackhi_ps(row2, row3); \
row0 = _mm_movelh_ps(tmp0, tmp2); \
row1 = _mm_movehl_ps(tmp2, tmp0); \
row2 = _mm_movelh_ps(tmp1, tmp3); \
row3 = _mm_movehl_ps(tmp3, tmp1); \
}
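/* Usage sketch (hypothetical): transpose a 4x4 row-major matrix in place:
 *   __m128 r0 = _mm_loadu_ps(&m[0]);
 *   __m128 r1 = _mm_loadu_ps(&m[4]);
 *   __m128 r2 = _mm_loadu_ps(&m[8]);
 *   __m128 r3 = _mm_loadu_ps(&m[12]);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3); // r0..r3 now hold the columns
 */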
/***************************************************************************
* max and min
***************************************************************************/
INLINE __m128i _mm_max_epu8 (__m128i a, __m128i b)
{
return (__m128i)vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
}
INLINE __m128i _mm_max_epi16 (__m128i a, __m128i b)
{
return (__m128i)vmaxq_s16((int16x8_t) a, (int16x8_t) b);
}
/* Return value
* A 128-bit parameter that can be defined with the following equations:
* r0 := (a0 > b0) ? a0 : b0
* r1 := (a1 > b1) ? a1 : b1
* r2 := (a2 > b2) ? a2 : b2
* r3 := (a3 > b3) ? a3 : b3
* */
INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
{
return vmaxq_s32(a,b);
}
/* TODO: NaN inputs behave differently here than in SSE (vmaxq_f32 propagates
 * NaN, while _mm_max_ps returns its second operand when either input is NaN);
 * needs more testing.
 */
INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
return vmaxq_f32(a, b);
}
INLINE __m128i _mm_min_epu8 (__m128i a, __m128i b)
{
return (__m128i)vminq_u8((uint8x16_t) a, (uint8x16_t) b);
}
INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
{
return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b);
}
/* Return value
* A 128-bit parameter that can be defined with the following equations:
* r0 := (a0 < b0) ? a0 : b0
* r1 := (a1 < b1) ? a1 : b1
* r2 := (a2 < b2) ? a2 : b2
* r3 := (a3 < b3) ? a3 : b3
* */
INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
{
return vminq_s32(a,b);
}
/* TODO: NaN inputs behave differently here than in SSE (vminq_f32 propagates
 * NaN, while _mm_min_ps returns its second operand when either input is NaN);
 * needs more testing.
 */
INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
return vminq_f32(a, b);
}
/***************************************************************************
* add and sub
***************************************************************************/
/* Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit integers of a and saturates.
* r0 := UnsignedSaturate(a0 - b0)
* r1 := UnsignedSaturate(a1 - b1)
* ...
* r15 := UnsignedSaturate(a15 - b15)
* */
INLINE __m128i _mm_subs_epu8 (__m128i a, __m128i b)
{
return (__m128i)vqsubq_u8((uint8x16_t) a, (uint8x16_t) b);
}
INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b);
}
INLINE __m128i _mm_sub_epi16 (__m128i a, __m128i b)
{
return (__m128i)vsubq_s16((int16x8_t) a, (int16x8_t) b);
}
INLINE __m128i _mm_adds_epu16 (__m128i a, __m128i b)
{
return (__m128i)vqaddq_u16((uint16x8_t) a, (uint16x8_t) b);
}
INLINE __m128i _mm_subs_epu16 (__m128i a, __m128i b)
{
return (__m128i)vqsubq_u16((uint16x8_t) a, (uint16x8_t) b);
}
/* Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b and saturates.
* r0 := SignedSaturate(a0 + b0)
* r1 := SignedSaturate(a1 + b1)
* ...
* r7 := SignedSaturate(a7 + b7)
* */
INLINE __m128i _mm_adds_epi16 (__m128i a, __m128i b)
{
return (__m128i)vqaddq_s16((int16x8_t)a, (int16x8_t)b);
}
/* Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers of a and saturates.
* r0 := SignedSaturate(a0 - b0)
* r1 := SignedSaturate(a1 - b1)
* ...
* r7 := SignedSaturate(a7 - b7)
* */
INLINE __m128i _mm_subs_epi16 (__m128i a, __m128i b)
{
return (__m128i)vqsubq_s16((int16x8_t) a, (int16x8_t) b);
}
/* Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b.
* r0 := a0 + b0
* r1 := a1 + b1
* r2 := a2 + b2
* r3 := a3 + b3
* */
INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
return vaddq_s32(a, b);
}
INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
return (__m128i)vaddq_s64((int64x2_t)a, (int64x2_t)b);
}
/* Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a.
* r0 := a0 - b0
* r1 := a1 - b1
* r2 := a2 - b2
* r3 := a3 - b3
* */
INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
return vsubq_s32(a, b);
}
/* Adds the four single-precision, floating-point values of a and b.
* r0 := a0 + b0
* r1 := a1 + b1
* r2 := a2 + b2
* r3 := a3 + b3
* */
INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
return vaddq_f32(a, b);
}
/* Subtracts the four single-precision, floating-point values of a and b.
* r0 := a0 - b0
* r1 := a1 - b1
* r2 := a2 - b2
* r3 := a3 - b3
* */
INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
{
return vsubq_f32(a, b);
}
/* The haddps instruction performs a horizontal add: adjacent elements in the
 * same operand are added together. Each 128-bit argument is treated as four
 * 32-bit floating-point elements, numbered 0 to 3, with 3 the high-order
 * element. For operands a = (A3, A2, A1, A0) and b = (B3, B2, B1, B0), the
 * result is (B3 + B2, B1 + B0, A3 + A2, A1 + A0).
 * This routine is only available as an intrinsic. */
INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
return vcombine_f32(vpadd_f32(vget_low_f32(a), vget_high_f32(a)), vpadd_f32(vget_low_f32(b), vget_high_f32(b)));
}
/***************************************************************************
* Multiply
***************************************************************************/
/* Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b.
* r0 := (a0 * b0)[31:16]
* r1 := (a1 * b1)[31:16]
* ...
* r7 := (a7 * b7)[31:16]
* */
INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
return (__m128i)vshrq_n_s16(vqdmulhq_s16((int16x8_t)a, (int16x8_t)b), 1);
}
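/* Note: vqdmulhq_s16 saturates its doubled product, so for the single case
 * a == b == INT16_MIN this returns 0x3FFF where SSE's PMULHW returns 0x4000;
 * all other inputs match exactly. */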
/* Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b.
* r0 := (a0 * b0)[15:0]
* r1 := (a1 * b1)[15:0]
* ...
* r7 := (a7 * b7)[15:0]
* */
INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b);
}
/* Multiplies the four single-precision, floating-point values of a and b.
* r0 := a0 * b0
* r1 := a1 * b1
* r2 := a2 * b2
* r3 := a3 * b3
* */
INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
/* TODO: NEON (ARMv7) flushes denormal results to zero, so vmulq_f32
 * disagrees with SSE on tiny products, e.g.:
 *   NEON: (-2.33512e-28) * (-2.13992e-13) = 0
 *   SSE:  (-2.33512e-28) * (-2.13992e-13) = 4.99689e-41
 * The scalar fallback below preserves the denormal result.
 */
// return vmulq_f32(a, b);
__m128 ret;
ret[0] = a[0]*b[0];
ret[1] = a[1]*b[1];
ret[2] = a[2]*b[2];
ret[3] = a[3]*b[3];
return ret;
}
/* Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b.
* __m128i _mm_madd_epi16 (__m128i a, __m128i b);
* PMADDWD
* Return Value
* Adds the signed 32-bit integer results pairwise and packs the 4 signed 32-bit integer results.
* r0 := (a0 * b0) + (a1 * b1)
* r1 := (a2 * b2) + (a3 * b3)
* r2 := (a4 * b4) + (a5 * b5)
* r3 := (a6 * b6) + (a7 * b7)
* */
INLINE __m128i _mm_madd_epi16 (__m128i a, __m128i b)
{
int32x4_t r_l = vmull_s16(vget_low_s16((int16x8_t)a), vget_low_s16((int16x8_t)b));
int32x4_t r_h = vmull_s16(vget_high_s16((int16x8_t)a), vget_high_s16((int16x8_t)b));
return vcombine_s32(vpadd_s32(vget_low_s32(r_l), vget_high_s32(r_l)), vpadd_s32(vget_low_s32(r_h), vget_high_s32(r_h)));
}
/***************************************************************************
* absdiff
***************************************************************************/
//#define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))
/* Computes the absolute difference of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b.
* __m128i _mm_sad_epu8 (__m128i a, __m128i b);
* PSADBW
* Return Value
* Sums the upper 8 differences and lower 8 differences and packs the resulting 2 unsigned 16-bit integers into the upper and lower 64-bit elements.
* r0 := abs(a0 - b0) + abs(a1 - b1) +...+ abs(a7 - b7)
* r1 := 0x0 ; r2 := 0x0 ; r3 := 0x0
* r4 := abs(a8 - b8) + abs(a9 - b9) +...+ abs(a15 - b15)
* r5 := 0x0 ; r6 := 0x0 ; r7 := 0x0
* */
INLINE __m128i _mm_sad_epu8 (__m128i a, __m128i b)
{
uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
uint16_t r0 = t[0]+t[1]+t[2]+t[3];
uint16_t r4 = t[4]+t[5]+t[6]+t[7];
uint16x8_t r = vsetq_lane_u16(r0,vdupq_n_u16(0),0);
return (__m128i)vsetq_lane_u16(r4,r,4);
}
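/* Note: on AArch64 the horizontal sums above could instead use the
 * across-lane intrinsics, e.g. vaddlv_u16(vget_low_u16(t)) and
 * vaddlv_u16(vget_high_u16(t)); the element-indexing form works on both
 * ARMv7 and AArch64 with GCC/Clang vector extensions. */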
/***************************************************************************
* divides
***************************************************************************/
/* r0 := a0 / b0
* r1 := a1 / b1
* r2 := a2 / b2
* r3 := a3 / b3
* */
INLINE __m128 _mm_div_ps(__m128 a, __m128 b )
{
// get an initial estimate of 1/b.
float32x4_t reciprocal = vrecpeq_f32(b);
// use a couple Newton-Raphson steps to refine the estimate. Depending on your
// application's accuracy requirements, you may be able to get away with only
// one refinement (instead of the two used here). Be sure to test!
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
// and finally, compute a/b = a*(1/b)
float32x4_t result = vmulq_f32(a,reciprocal);
return result;
}
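/* Note: on AArch64, return vdivq_f32(a, b); would give exact IEEE division
 * and avoid the estimate's precision loss entirely. */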
/* Computes the square roots of the four single-precision, floating-point values of a.
* r0 := sqrt(a0)
* r1 := sqrt(a1)
* r2 := sqrt(a2)
* r3 := sqrt(a3)
* */
INLINE __m128 _mm_sqrt_ps(__m128 in)
{
__m128 recipsq = vrsqrteq_f32(in);
__m128 sq = vrecpeq_f32(recipsq);
// TODO: use the Newton-Raphson step versions of both rsqrt and recip for
// better accuracy; the estimate-only version above loses precision.
// __m128 recipsq = vrsqrtsq_f32(in,vdupq_n_f32(1.0));
// __m128 sq = vrecpsq_f32(recipsq,vdupq_n_f32(1.0));
return sq;
}
/***************************************************************************
* logic
***************************************************************************/
/* Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
* r := a | b
* */
INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
{
return (__m128i)vorrq_s32(a, b);
}
/* Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b.
* r := a ^ b
* */
INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
{
return veorq_s32(a, b);
}
/* Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a.
* r := (~a) & b
* */
INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
return vbicq_s32(b, a); // *NOTE* argument swap
}
/* Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b.
* r0 := ~a0 & b0
* r1 := ~a1 & b1
* r2 := ~a2 & b2
* r3 := ~a3 & b3
* */
INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap
}
/* Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b.
* r := a & b
* */
INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
return vandq_s32(a, b);
}
/* Computes the bitwise AND of the four single-precision, floating-point values of a and b.
* r0 := a0 & b0
* r1 := a1 & b1
* r2 := a2 & b2
* r3 := a3 & b3
* */
INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
return (__m128)vandq_s32((__m128i)a, (__m128i)b);
}
/* Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for greater than.
* r0 := (a0 > b0) ? 0xff : 0x0
* r1 := (a1 > b1) ? 0xff : 0x0
* ...
* r15 := (a15 > b15) ? 0xff : 0x0
*/
INLINE __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
{
return (__m128i)vcgtq_s8((int8x16_t) a,( int8x16_t) b);
}
/* Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers in b for greater than.
* r0 := (a0 > b0) ? 0xffff : 0x0
* r1 := (a1 > b1) ? 0xffff : 0x0
* ...
* r7 := (a7 > b7) ? 0xffff : 0x0
* */
INLINE __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
{
return (__m128i)vcgtq_s16((int16x8_t) a,( int16x8_t) b);
}
/* Compares for greater than.
* r0 := (a0 > b0) ? 0xffffffff : 0x0
* r1 := (a1 > b1) ? 0xffffffff : 0x0
* r2 := (a2 > b2) ? 0xffffffff : 0x0
* r3 := (a3 > b3) ? 0xffffffff : 0x0
* */
INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
return (__m128)vcgtq_f32(a, b);
}
/* Compares for less than or equal.
* r0 := (a0 <= b0) ? 0xffffffff : 0x0
* r1 := (a1 <= b1) ? 0xffffffff : 0x0
* r2 := (a2 <= b2) ? 0xffffffff : 0x0
* r3 := (a3 <= b3) ? 0xffffffff : 0x0
* */
INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
return (__m128)vcleq_f32(a, b);
}
/***************************************************************************
* load and store
***************************************************************************/
INLINE __m128i _mm_load_si128(const __m128i *p)
{
return vld1q_s32((int32_t *)p);
}
INLINE void _mm_store_si128(__m128i *p, __m128i a )
{
vst1q_s32((int32_t*) p,a);
}
INLINE __m128 _mm_load_ps(const float * p)
{
return vld1q_f32(p);
}
INLINE void _mm_store_ps(float *p, __m128 a)
{
vst1q_f32(p, a);
}
#define _mm_loadu_si128 _mm_load_si128
#define _mm_storeu_si128 _mm_store_si128
#define _mm_loadu_ps _mm_load_ps
#define _mm_storeu_ps _mm_store_ps
INLINE __m128i _mm_loadl_epi64(__m128i const*p)
{
/* Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result. */
return vcombine_s32(vld1_s32((int32_t const *)p),vcreate_s32(0));
}
INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
{
/* Reads the lower 64 bits of b and stores them into the lower 64 bits of a. */
//
//*a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0);
//vst1_s64( (int64_t *) a, vget_low_s64((int64x2_t)b));
vst1_s32( (int32_t *) a, vget_low_s32((int32x4_t)b));
}
/* Sets the lower two single-precision, floating-point values with 64 bits of data loaded from the address p; the upper two values are passed through from a.
* __m128 _mm_loadl_pi( __m128 a , __m64 * p );
* MOVLPS reg, mem
* Return Value
* r0 := *p0
* r1 := *p1
* r2 := a2
* r3 := a3
* */
INLINE __m128 _mm_loadl_pi( __m128 a , __m64 const * p )
{
return vcombine_f32(vld1_f32((float32_t const *)p),vget_high_f32(a));
}
/* Stores the lower two single-precision, floating-point values of a to the address p.
* *p0 := b0
* *p1 := b1
* */
INLINE void _mm_storel_pi( __m64 * p , __m128 a )
{
vst1_f32((float32_t *)p, vget_low_f32((float32x4_t)a));
}
INLINE __m128 _mm_load_ss(const float * p)
{
/* Loads a single-precision, floating-point value into the low word and clears the upper three words. */
__m128 result = vdupq_n_f32(0);
return vsetq_lane_f32(*p, result, 0);
}
INLINE void _mm_store_ss(float *p, __m128 a)
{
/* Stores the lower single-precision, floating-point value. */
vst1q_lane_f32(p, a, 0);
}
/***************************************************************************
* SET
***************************************************************************/
/* Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits.
* r0 := a
* r1 := 0x0 ; r2 := 0x0 ; r3 := 0x0
* */
INLINE __m128i _mm_cvtsi32_si128(int a)
{
__m128i result = vdupq_n_s32(0);
return vsetq_lane_s32(a, result, 0);
}
/* Sets the 16 signed 8-bit integer values to b.
* r0 := b
* r1 := b
* ...
* r15 := b
* */
INLINE __m128i _mm_set1_epi8 (char b)
{
return (__m128i)vdupq_n_s8((int8_t)b);
}
/* Sets the 8 signed 16-bit integer values to w.
* r0 := w
* r1 := w
* ...
* r7 := w
* */
INLINE __m128i _mm_set1_epi16 (short w)
{
return (__m128i)vdupq_n_s16((int16_t)w);
}
/* Sets the 4 signed 32-bit integer values to i.
* r0 := i
* r1 := i
* r2 := i
* r3 := i
* */
INLINE __m128i _mm_set1_epi32(int i)
{
return vdupq_n_s32(i);
}
/* Sets the four single-precision, floating-point values to w
* r0 := r1 := r2 := r3 := w
* */
INLINE __m128 _mm_set1_ps(float w)
{
return vdupq_n_f32(w);
}
#define _mm_set_ps1 _mm_set1_ps
INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
float __attribute__((aligned(16))) data[4] = {x,y,z,w};
return vld1q_f32(data);
}
/* Sets the 8 signed 16-bit integer values in reverse order.
* __m128i _mm_setr_epi16 (short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7);
* (composite)
* Return Value
* r0 := w0
* r1 := w1
* ...
* r7 := w7
* */
INLINE __m128i _mm_setr_epi16 (short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
short __attribute__ ((aligned (16))) data[8] = { w0, w1, w2, w3, w4, w5, w6, w7 };
return (__m128i)vld1q_s16((int16_t*)data);
}
//todo ~~~~~~~~~~~~~~~~~~~~~~~~~
/* Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. */
INLINE __m128i _mm_shuffle_epi32 (__m128i a, int imm)
{
__m128i ret;
switch (imm)
{
case 0 :
return vdupq_n_s32(vgetq_lane_s32(a, 0));
break;
default:
// __m128i ret = vdupq_n_s32(0);
ret[0] = a[imm & 0x3];
ret[1] = a[(imm >> 2) & 0x3];
ret[2] = a[(imm >> 4) & 0x03];
ret[3] = a[(imm >> 6) & 0x03];
return ret;
}
}
INLINE __m128i _mm_shufflelo_epi16 (__m128i a, int imm)
{
// FIXME: Do this properly, this doesn't currently work:
return _mm_shuffle_epi32 (a, imm);
}
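/* An untested sketch of a correct version would shuffle only the low four
 * 16-bit lanes and pass the high 64 bits through:
 *   int16x8_t v = (int16x8_t)a, r = v;
 *   r[0] = v[imm & 0x3];
 *   r[1] = v[(imm >> 2) & 0x3];
 *   r[2] = v[(imm >> 4) & 0x3];
 *   r[3] = v[(imm >> 6) & 0x3];
 *   return (__m128i)r;
 */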
//todo ~~~~~~~~~~~~~~~~~~~~~~~~~
/* Selects four specific single-precision, floating-point values from a and b, based on the mask i. */
INLINE __m128 _mm_shuffle_ps(__m128 a , __m128 b , int i )
{
__m128 ret;
switch (i)
{
// case 0 :
// return
// break;
default:
ret[0] = a[i & 0x3];
ret[1] = a[(i >> 2) & 0x3];
ret[2] = b[(i >> 4) & 0x03];
ret[3] = b[(i >> 6) & 0x03];
return ret;
}
}
/***************************************************************************
* GET
***************************************************************************/
INLINE int _mm_cvtsi128_si32(__m128i a)
{
/* Moves the least significant 32 bits of a to a 32-bit integer. */
return vgetq_lane_s32(a, 0);
}
/* Sets the 128-bit value to zero.
* r := 0x0
* */
INLINE __m128i _mm_setzero_si128()
{
return vdupq_n_s32(0);
}
/* Clears the four single-precision, floating-point values.
* r0 := r1 := r2 := r3 := 0.0
* */
INLINE __m128 _mm_setzero_ps(void)
{
return vdupq_n_f32(0);
}
/***************************************************************************
* convert
***************************************************************************/
/* Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates.
* r0 := SignedSaturate(a0)
* r1 := SignedSaturate(a1)
* r2 := SignedSaturate(a2)
* r3 := SignedSaturate(a3)
* r4 := SignedSaturate(b0)
* r5 := SignedSaturate(b1)
* r6 := SignedSaturate(b2)
* r7 := SignedSaturate(b3)
* */
INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b));
}
/* Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b.
* r0 := a8 ; r1 := b8
* r2 := a9 ; r3 := b9
* ...
* r14 := a15 ; r15 := b15
* */
INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
int8x8_t a_h = vget_high_s8((int8x16_t)a);
int8x8_t b_h = vget_high_s8((int8x16_t)b);
int8x8x2_t r = vzip_s8(a_h, b_h);
return (__m128i)vcombine_s8(r.val[0],r.val[1]);
}
/* Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b.
* r0 := a0 ; r1 := b0
* r2 := a1 ; r3 := b1
* ...
* r14 := a7 ; r15 := b7
* */
INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
int8x8_t a_l = vget_low_s8((int8x16_t)a);
int8x8_t b_l = vget_low_s8((int8x16_t)b);
int8x8x2_t r = vzip_s8(a_l, b_l);
return (__m128i)vcombine_s8(r.val[0],r.val[1]);
}
/* Interleaves the lower 2 signed or unsigned 32-bit integers in a with the lower 2 signed or unsigned 32-bit integers in b.
* r0 := a0 ; r1 := b0
* r2 := a1 ; r3 := b1
* */
INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
int32x2_t a_l = vget_low_s32((int32x4_t)a);
int32x2_t b_l = vget_low_s32((int32x4_t)b);
int32x2x2_t r = vzip_s32(a_l, b_l);
return (__m128i)vcombine_s32(r.val[0],r.val[1]);
}
INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
int64x1_t a_l = vget_low_s64((int64x2_t)a);
int64x1_t b_l = vget_low_s64((int64x2_t)b);
return (__m128i)vcombine_s64(a_l, b_l);
}
/* Interleaves the upper signed or unsigned 64-bit integer in a with the upper signed or unsigned 64-bit integer in b.
* __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b);
* PUNPCKHQDQ
* Return Value
* r0 := a1 ; r1 := b1
* */
INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
int64x1_t a_h = vget_high_s64((int64x2_t)a);
int64x1_t b_h = vget_high_s64((int64x2_t)b);
return (__m128i)vcombine_s64(a_h, b_h);
}
/* Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b.
* r0 := a4 ; r1 := b4
* r2 := a5 ; r3 := b5
* r4 := a6 ; r5 := b6
* r6 := a7 ; r7 := b7
* */
INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
int16x4_t a_h = vget_high_s16((int16x8_t)a);
int16x4_t b_h = vget_high_s16((int16x8_t)b);
int16x4x2_t result = vzip_s16(a_h, b_h);
return (__m128i)vcombine_s16(result.val[0], result.val[1]);
}
/* Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.
* r0 := a0 ; r1 := b0
* r2 := a1 ; r3 := b1
* r4 := a2 ; r5 := b2
* r6 := a3 ; r7 := b3
* */
INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
int16x4_t a_l = vget_low_s16((int16x8_t)a);
int16x4_t b_l = vget_low_s16((int16x8_t)b);
int16x4x2_t result = vzip_s16(a_l, b_l);
return (__m128i)vcombine_s16(result.val[0], result.val[1]);
}
INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
int32x2_t a1 = vget_high_s32(a);
int32x2_t b1 = vget_high_s32(b);
int32x2x2_t result = vzip_s32(a1, b1);
return vcombine_s32(result.val[0], result.val[1]);
}
/* Selects and interleaves the lower two single-precision, floating-point values from a and b.
* r0 := a0
* r1 := b0
* r2 := a1
* r3 := b1
* */
INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b));
return vcombine_f32(result.val[0], result.val[1]);
}
/* Selects and interleaves the upper two single-precision, floating-point values from a and b.
* r0 := a2
* r1 := b2
* r2 := a3
* r3 := b3
* */
INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b));
return vcombine_f32(result.val[0], result.val[1]);
}
INLINE __m128 _mm_cvtepi32_ps(__m128i a)
{
return vcvtq_f32_s32(a);
}
/* Converts the four single-precision, floating-point values of a to signed 32-bit integer values.
* r0 := (int) a0
* r1 := (int) a1
* r2 := (int) a2
* r3 := (int) a3
* */
INLINE __m128i _mm_cvtps_epi32(__m128 a)
{
// TODO: precision: vcvtq_s32_f32 truncates toward zero, while SSE's
// _mm_cvtps_epi32 rounds to nearest by default; NaN and -0 inputs also
// behave differently.
return vcvtq_s32_f32(a);
// __m128i ret;
// ret[0] = (int64_t)a[0];
// ret[1] = (int64_t)a[1];
// ret[2] = (int64_t)a[2];
// ret[3] = (int64_t)a[3];
// return ret;
}
/* Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned integers and saturates.
*
* r0 := UnsignedSaturate(a0)
* r1 := UnsignedSaturate(a1)
* ...
* r7 := UnsignedSaturate(a7)
* r8 := UnsignedSaturate(b0)
* r9 := UnsignedSaturate(b1)
* ...
* r15 := UnsignedSaturate(b7)
* */
INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
{
return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b));
}
/* Moves the lower two single-precision, floating-point values of b to the upper two single-precision, floating-point values of the result.
* r3 := b1
* r2 := b0
* r1 := a1
* r0 := a0
* */
INLINE __m128 _mm_movelh_ps( __m128 a, __m128 b )
{
return vcombine_f32(vget_low_f32(a),vget_low_f32(b));
}
/* The upper two single-precision, floating-point values of a are passed through to the result.
* r3 := a3
* r2 := a2
* r1 := b3
* r0 := b2
* */
INLINE __m128 _mm_movehl_ps( __m128 a, __m128 b )
{
return vcombine_f32(vget_high_f32(b),vget_high_f32(a));
}
/***************************************************************************
* shift
***************************************************************************/
/* Shifts the 4 signed 32-bit integers in a right by count bits while shifting in the sign bit.
* r0 := a0 >> count
* r1 := a1 >> count
* r2 := a2 >> count
* r3 := a3 >> count
* immediate ,use #define _mm_srai_epi32(a, imm) vshrq_n_s32(a, imm)
* */
INLINE __m128i _mm_srai_epi32 (__m128i a, int count)
{
// TODO: when count is a compile-time immediate, vshrq_n_s32 can be used
// directly instead of the variable-shift fallback below:
// return vshrq_n_s32(a, count);
return vshlq_s32(a, vdupq_n_s32(-count));
}
/* Shifts the 8 signed 16-bit integers in a right by count bits while shifting in the sign bit.
* r0 := a0 >> count
* r1 := a1 >> count
* ...
* r7 := a7 >> count
* */
INLINE __m128i _mm_srai_epi16 (__m128i a, int count)
{
// TODO: likewise, vshrq_n_s16 applies when count is an immediate:
// return vshrq_n_s16(a, count);
return (__m128i)vshlq_s16((int16x8_t)a, vdupq_n_s16(-count));
}
/* Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while shifting in zeros.
* r0 := a0 << count
* r1 := a1 << count
* ...
* r7 := a7 << count
* */
INLINE __m128i _mm_slli_epi16(__m128i a, int count)
{
// TODO: use vshlq_n_s16 when count is a compile-time immediate.
return (__m128i)vshlq_s16((int16x8_t)a, vdupq_n_s16(count));
}
/* Shifts the 8 signed or unsigned 16-bit integers in a right by count bits while shifting in zeros.
* r0 := srl(a0, count)
* r1 := srl(a1, count)
* ...
* r7 := srl(a7, count)
* */
INLINE __m128i _mm_srli_epi16(__m128i a, int count)
{
// TODO: use vshrq_n_u16 when count is a compile-time immediate.
return (__m128i)vshlq_u16((uint16x8_t)a, vdupq_n_s16(-count));
}
/* Shifts the 128-bit value in a right by imm bytes while shifting in zeros. imm must be an immediate.
* r := srl(a, imm*8)
* */
#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm))
/* Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate.
* r := a << (imm * 8)*/
// TODO: imm == 0 fails to compile, since vextq_s8 with lane index 16 is out of range.
#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm))
#endif
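A minimal smoke test for the shim might look like this (hypothetical file, assuming sse2neon.h is on the include path, e.g. in /usr/local/include as mentioned in the comments below):

/* test_sse2neon.c: build on ARM with, e.g., gcc -O2 test_sse2neon.c */
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
	__m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes are 1,2,3,4 */
	__m128 b = _mm_set1_ps(2.0f);
	float sum[4] __attribute__((aligned(16)));
	float quot[4] __attribute__((aligned(16)));

	_mm_store_ps(sum, _mm_add_ps(a, b));
	_mm_store_ps(quot, _mm_div_ps(a, b)); /* approximate: reciprocal estimate */

	printf("sum:  %g %g %g %g\n", sum[0], sum[1], sum[2], sum[3]);
	printf("quot: %g %g %g %g\n", quot[0], quot[1], quot[2], quot[3]);
	return 0;
}

Expected output is roughly sum: 3 4 5 6 and quot: 0.5 1 1.5 2; the quotients are approximate because _mm_div_ps above uses a single Newton-Raphson refinement.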
@Epiclemonaid commented Nov 23, 2019:

How did you get past the OBS initialization OpenGL segmentation fault (core dump)?

@danieloneill (author):

Oh, I just put it in /usr/local/include

@Epiclemonaid:

You copied the obs executable there? I built it in my $HOME/obs-studio; or do I move the whole folder?

@danieloneill (author) commented Nov 26, 2019 via email

@Epiclemonaid:

Ahh, I was wondering what magic you had in that obs.sh script. Great work, btw. I was attempting this on my own, then stumbled upon this. I'll give it a go tonight. I'm working on that custom encoder to see if I can get it to use the Tegra.

@danieloneill (author) commented Nov 26, 2019 via email

@Epiclemonaid commented Dec 5, 2019:

Okay, I finally tried it with:

#!/bin/bash
export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libGL.so
./bin/64bit/obs

Still getting a segmentation fault. Any wisdom to get past this? And did you compile the normal version of OBS or the portable one?

Thanks for your help!

@danieloneill (author) commented Dec 5, 2019 via email

@ghost commented Dec 5, 2019:

Thank you!

@Epiclemonaid:

I'm not super comfortable with valgrind or gdb. If you have a gist of what I should run to get that info, that would be great. I'm setting up a fresh JetPack install and going to go through it again, just to make sure I didn't mess anything up while monkeying around.

@danieloneill (author):

cd build/rundir/RelWithDebInfo
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libGL.so valgrind ./bin/64bit/obs

Then just paste where it segfaulted.

Or with gdb:

LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libGL.so gdb ./bin/64bit/obs
run
(after the crash)
bt

Then just paste that backtrace output into your comment here.

ldd bin/64bit/obs will tell us "some" things, but obs dlopens various libraries at runtime which won't show up there. Still, it'll tell us what it did link to at compile time, which could also be causing issues.
