Konstantin Const-me

## vrcpps-errors.cpp
#include <stdint.h>
#include <atlfile.h>
#include <intrin.h>
#include <array>
// These headers are from here: https://github.com/Const-me/IntelIntrinsics/tree/master/CppDemo/Intrinsics
#include "Intrinsics/avx.hpp"
#include "Intrinsics/avx2.hpp"
#include "Intrinsics/sse.hpp"
#include "Intrinsics/sse2.hpp"
using namespace Intrinsics::Avx;

## matVecMult81.cpp
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
	// 30 vector registers in total; ARM64 has 32 of them, so we're good.
	float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
	float32x4_t mat0, mat1, mat2, mat3, mat4;
	float32x4_t res0, res1, res2, res3;

	vec80 = mat4 = vdupq_n_f32( 0.0f );
	// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 )      \

## importNv12Texture.cpp
#include "pch.h"
#include "RenderDeviceGLImpl.hpp"
#include "../../../../NetCore/ModeSet/API/eglContext.h"
#include <EGL/egl.h>
#include <EGL/eglext.h>
#define GL_GLEXT_PROTOTYPES
#include <GLES2/gl2.h>
#include <GLES2/gl2ext.h>
#include <libdrm/drm_fourcc.h>
#include <string>

## div2-cs.cs
static class C
{
    /// <summary>Halves <paramref name="i"/> using integer division, which truncates toward zero.</summary>
    /// <param name="i">The value to halve.</param>
    /// <returns>The quotient of <paramref name="i"/> divided by 2.</returns>
    static int div2(int i) => i / 2;
}

## simIntegersAvx2.cpp
#include <vector>
#include <assert.h>
#include <immintrin.h>

struct data
{
	std::vector<int8_t> byteVals; // byteVals[i] == -128 means look in intVals
	std::vector<int> intVals; // length is number of -128 values in byteVals

	void push_back( int v )

## NeonTest.cs
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

static class MotionDetectNeon
{
	/// <summary>Compute absolute difference between a and b, count the elements with difference above the threshold.</summary>
	[MethodImpl( MethodImplOptions.AggressiveInlining )]
	static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc )

## NeonTest.cpp
int NeonTest( const uint8_t* lhs, const uint8_t* rhs, size_t count )
{
	// If the length is not a multiple of 16, you will need more code to handle the remainder
	assert( 0 == ( count % 16 ) );

	const uint8_t* const lhsEnd = lhs + count;
	int32x4_t acc = vdupq_n_s32( 0 );

	// The threshold is a power of 2, so the comparison v >= 16 is done with a bit test
	const uint8x16_t thresholdBitMask = vdupq_n_u8( 0xF0 );

## store_10x16_avx2.cpp
// Store 10-bit pieces from 16-bit lanes of the AVX2 vector, with truncation.
// The function writes 20 bytes to the pointer.
inline void store_10x16_avx2( __m256i v, uint8_t* rdi )
{
	__m256i low, high;
	// Pack pairs of 10 bits into 20
	low = _mm256_slli_epi16( v, 6 );
	v = _mm256_blend_epi16( v, low, 0b01010101 );

	// Now the vector contains 32-bit lanes with 20 payload bits / each in the middle of them

## shiftLeftBytes_mem.cpp
inline __m256i shiftLeftBytes_mem( const __m256i src, int i )
{
	assert( i >= 0 && i < 32 );
	// Align by 64 bytes so the complete array stays in a single cache line
	alignas( 64 ) std::array<uint8_t, 64> buffer;
	// Store zeros at offset 0
	_mm256_store_si256( ( __m256i* )buffer.data(), _mm256_setzero_si256() );
	// Store the source vector at offset 32
	_mm256_store_si256( ( __m256i* )( buffer.data() + 32 ), src );
	// Load back with the offset

## storeu_10x16.cpp
// Store 10-bit pieces from each of the 16-bit lanes of the AVX2 vector.
// The function writes 20 bytes to the pointer.
inline void storeu_10x16( __m256i v, uint8_t* dest )
{
	// Pack pairs of 10 bits into 20, in 32-bit lanes
	__m256i high = _mm256_srli_epi32( v, 16 - 10 );
	const __m256i low10 = _mm256_set1_epi32( ( 1 << 10 ) - 1 ); // Bitmask of 10 lowest bits in 32-bit lanes
	__m256i low = _mm256_and_si256( v, low10 );
	high = _mm256_andnot_si256( low10, high );
	v = _mm256_or_si256( low, high );
	#include <stdint.h>
	#include <atlfile.h>
	#include <intrin.h>
	#include <array>
	// These headers are from here: https://github.com/Const-me/IntelIntrinsics/tree/master/CppDemo/Intrinsics
	#include "Intrinsics/avx.hpp"
	#include "Intrinsics/avx2.hpp"
	#include "Intrinsics/sse.hpp"
	#include "Intrinsics/sse2.hpp"
	using namespace Intrinsics::Avx;
	void matVecMult81( float pDst, const float pMat, const float *pVec, size_t nRows = 90000 )
	{
	// 30 vector registers in total; ARM64 has 32 of them, so we're good.
	float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
	float32x4_t mat0, mat1, mat2, mat3, mat4;
	float32x4_t res0, res1, res2, res3;

	vec80 = mat4 = vdupq_n_f32( 0.0f );
	// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
	#define LOAD_VEC_16( v0, v1, v2, v3 ) \
	#include "pch.h"
	#include "RenderDeviceGLImpl.hpp"
	#include "../../../../NetCore/ModeSet/API/eglContext.h"
	#include <EGL/egl.h>
	#include <EGL/eglext.h>
	#define GL_GLEXT_PROTOTYPES
	#include <GLES2/gl2.h>
	#include <GLES2/gl2ext.h>
	#include <libdrm/drm_fourcc.h>
	#include <string>
	#include <vector>
	#include <assert.h>
	#include <immintrin.h>

	struct data
	{
	std::vector<int8_t> byteVals; // byteVals[i] == -128 means look in intVals
	std::vector<int> intVals; // length is number of -128 values in byteVals

	void push_back( int v )
	using System;
	using System.Runtime.CompilerServices;
	using System.Runtime.Intrinsics;
	using System.Runtime.Intrinsics.Arm;

	static class MotionDetectNeon
	{
	/// <summary>Compute absolute difference between a and b, count the elements with difference above the threshold.</summary>
	[MethodImpl( MethodImplOptions.AggressiveInlining )]
	static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc )
	int NeonTest( const uint8_t* lhs, const uint8_t* rhs, size_t count )
	{
	// If the length is not a multiple of 16, you will need more code to handle the remainder
	assert( 0 == ( count % 16 ) );

	const uint8_t* const lhsEnd = lhs + count;
	int32x4_t acc = vdupq_n_s32( 0 );

	// The threshold is a power of 2, so the comparison v >= 16 is done with a bit test
	const uint8x16_t thresholdBitMask = vdupq_n_u8( 0xF0 );
	// Store 10-bit pieces from 16-bit lanes of the AVX2 vector, with truncation.
	// The function writes 20 bytes to the pointer.
	inline void store_10x16_avx2( __m256i v, uint8_t* rdi )
	{
	__m256i low, high;
	// Pack pairs of 10 bits into 20
	low = _mm256_slli_epi16( v, 6 );
	v = _mm256_blend_epi16( v, low, 0b01010101 );

	// Now the vector contains 32-bit lanes with 20 payload bits / each in the middle of them
	inline __m256i shiftLeftBytes_mem( const __m256i src, int i )
	{
	assert( i >= 0 && i < 32 );
	// Align by 64 bytes so the complete array stays in a single cache line
	alignas( 64 ) std::array<uint8_t, 64> buffer;
	// Store zeros at offset 0
	_mm256_store_si256( ( __m256i* )buffer.data(), _mm256_setzero_si256() );
	// Store the source vector at offset 32
	_mm256_store_si256( ( __m256i* )( buffer.data() + 32 ), src );
	// Load back with the offset
	// Store 10-bit pieces from each of the 16-bit lanes of the AVX2 vector.
	// The function writes 20 bytes to the pointer.
	inline void storeu_10x16( __m256i v, uint8_t* dest )
	{
	// Pack pairs of 10 bits into 20, in 32-bit lanes
	__m256i high = _mm256_srli_epi32( v, 16 - 10 );
	const __m256i low10 = _mm256_set1_epi32( ( 1 << 10 ) - 1 ); // Bitmask of 10 lowest bits in 32-bit lanes
	__m256i low = _mm256_and_si256( v, low10 );
	high = _mm256_andnot_si256( low10, high );
	v = _mm256_or_si256( low, high );