Skip to content

Instantly share code, notes, and snippets.

#include <stdint.h>
#include <atlfile.h>
#include <intrin.h>
#include <array>
// These headers are from here: https://github.com/Const-me/IntelIntrinsics/tree/master/CppDemo/Intrinsics
#include "Intrinsics/avx.hpp"
#include "Intrinsics/avx2.hpp"
#include "Intrinsics/sse.hpp"
#include "Intrinsics/sse2.hpp"
using namespace Intrinsics::Avx;
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
// 30 vector registers in total; ARM64 has 32 of them, so we're good.
float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
float32x4_t mat0, mat1, mat2, mat3, mat4;
float32x4_t res0, res1, res2, res3;
vec80 = mat4 = vdupq_n_f32( 0.0f );
// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 ) \
#include "pch.h"
#include "RenderDeviceGLImpl.hpp"
#include "../../../../NetCore/ModeSet/API/eglContext.h"
#include <EGL/egl.h>
#include <EGL/eglext.h>
#define GL_GLEXT_PROTOTYPES
#include <GLES2/gl2.h>
#include <GLES2/gl2ext.h>
#include <libdrm/drm_fourcc.h>
#include <string>
/// <summary>Container for a small integer arithmetic helper.</summary>
static class C
{
    /// <summary>Halves <paramref name="i"/> with integer division, which truncates toward zero.</summary>
    /// <param name="i">The value to halve.</param>
    /// <returns>The quotient of <paramref name="i"/> divided by 2.</returns>
    static int div2(int i) => i / 2;
}
#include <vector>
#include <assert.h>
#include <immintrin.h>
struct data
{
std::vector<int8_t> byteVals; // byteVals[i] == -128 means look in intVals
std::vector<int> intVals; // length is number of -128 values in byteVals
void push_back( int v )
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
static class MotionDetectNeon
{
/// <summary>Compute absolute difference between a and b, count the elements with difference above the threshold.</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc )
int NeonTest( const uint8_t* lhs, const uint8_t* rhs, size_t count )
{
// If the length is not a multiple of 16, more code is needed to handle the remainder
assert( 0 == ( count % 16 ) );
const uint8_t* const lhsEnd = lhs + count;
int32x4_t acc = vdupq_n_s32( 0 );
// The threshold is a power of 2, so the comparison v >= 16 is implemented as a bit test
const uint8x16_t thresholdBitMask = vdupq_n_u8( 0xF0 );
// Store 10-bit pieces from 16-bit lanes of the AVX2 vector, with truncation.
// The function writes 20 bytes to the pointer.
inline void store_10x16_avx2( __m256i v, uint8_t* rdi )
{
__m256i low, high;
// Pack pairs of 10 bits into 20
low = _mm256_slli_epi16( v, 6 );
v = _mm256_blend_epi16( v, low, 0b01010101 );
// Now the vector contains 32-bit lanes, each with 20 payload bits in the middle of the lane
inline __m256i shiftLeftBytes_mem( const __m256i src, int i )
{
assert( i >= 0 && i < 32 );
// Align by 64 bytes so the complete array stays in a single cache line
alignas( 64 ) std::array<uint8_t, 64> buffer;
// Store zeros at offset 0
_mm256_store_si256( ( __m256i* )buffer.data(), _mm256_setzero_si256() );
// Store the source vector at offset 32
_mm256_store_si256( ( __m256i* )( buffer.data() + 32 ), src );
// Load back with the offset
// Store 10-bit pieces from each of the 16-bit lanes of the AVX2 vector.
// The function writes 20 bytes to the pointer.
inline void storeu_10x16( __m256i v, uint8_t* dest )
{
// Pack pairs of 10 bits into 20, in 32-bit lanes
__m256i high = _mm256_srli_epi32( v, 16 - 10 );
const __m256i low10 = _mm256_set1_epi32( ( 1 << 10 ) - 1 ); // Bitmask of 10 lowest bits in 32-bit lanes
__m256i low = _mm256_and_si256( v, low10 );
high = _mm256_andnot_si256( low10, high );
v = _mm256_or_si256( low, high );