This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <atlfile.h> | |
#include <intrin.h> | |
#include <array> | |
// These headers come from this repository: https://github.com/Const-me/IntelIntrinsics/tree/master/CppDemo/Intrinsics | |
#include "Intrinsics/avx.hpp" | |
#include "Intrinsics/avx2.hpp" | |
#include "Intrinsics/sse.hpp" | |
#include "Intrinsics/sse2.hpp" | |
using namespace Intrinsics::Avx; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 ) | |
{ | |
// 30 vector registers in total; ARM64 has 32 of them, so we're good. | |
float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80; | |
float32x4_t mat0, mat1, mat2, mat3, mat4; | |
float32x4_t res0, res1, res2, res3; | |
vec80 = mat4 = vdupq_n_f32( 0.0f ); | |
// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer | |
#define LOAD_VEC_16( v0, v1, v2, v3 ) \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "pch.h" | |
#include "RenderDeviceGLImpl.hpp" | |
#include "../../../../NetCore/ModeSet/API/eglContext.h" | |
#include <EGL/egl.h> | |
#include <EGL/eglext.h> | |
#define GL_GLEXT_PROTOTYPES | |
#include <GLES2/gl2.h> | |
#include <GLES2/gl2ext.h> | |
#include <libdrm/drm_fourcc.h> | |
#include <string> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>Minimal holder for the integer halving helper.</summary>
static class C
{
	/// <summary>Halves a signed 32-bit integer using integer division.</summary>
	/// <param name="i">Value to halve.</param>
	/// <returns>The quotient <c>i / 2</c>; integer division discards the fractional part, so odd inputs round toward zero.</returns>
	static int div2(int i) => i / 2;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <vector> | |
#include <assert.h> | |
#include <immintrin.h> | |
struct data | |
{ | |
std::vector<int8_t> byteVals; // byteVals[i] == -128 means look in intVals | |
std::vector<int> intVals; // length is number of -128 values in byteVals | |
void push_back( int v ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.Arm; | |
static class MotionDetectNeon | |
{ | |
/// <summary>Compute absolute difference between a and b, count the elements with difference above the threshold.</summary> | |
[MethodImpl( MethodImplOptions.AggressiveInlining )] | |
static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int NeonTest( const uint8_t* lhs, const uint8_t* rhs, size_t count ) | |
{ | |
// If the length is not a multiple of 16, more code is needed to handle the remainder | |
assert( 0 == ( count % 16 ) ); | |
const uint8_t* const lhsEnd = lhs + count; | |
int32x4_t acc = vdupq_n_s32( 0 ); | |
// The threshold is a power of 2, so a bit test against the high bits implements the comparison v >= 16 | |
const uint8x16_t thresholdBitMask = vdupq_n_u8( 0xF0 ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Store 10-bit pieces from 16-bit lanes of the AVX2 vector, with truncation. | |
// The function writes 20 bytes to the pointer. | |
inline void store_10x16_avx2( __m256i v, uint8_t* rdi ) | |
{ | |
__m256i low, high; | |
// Pack pairs of 10 bits into 20 | |
low = _mm256_slli_epi16( v, 6 ); | |
v = _mm256_blend_epi16( v, low, 0b01010101 ); | |
// Now the vector contains 32-bit lanes with 20 payload bits / each in the middle of them |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
inline __m256i shiftLeftBytes_mem( const __m256i src, int i ) | |
{ | |
assert( i >= 0 && i < 32 ); | |
// Align by 64 bytes so the complete array stays in a single cache line | |
alignas( 64 ) std::array<uint8_t, 64> buffer; | |
// Store zeros at offset 0 | |
_mm256_store_si256( ( __m256i* )buffer.data(), _mm256_setzero_si256() ); | |
// Store the source vector at offset 32 | |
_mm256_store_si256( ( __m256i* )( buffer.data() + 32 ), src ); | |
// Load back with the offset |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Store 10-bit pieces from each of the 16-bit lanes of the AVX2 vector. | |
// The function writes 20 bytes to the pointer. | |
inline void storeu_10x16( __m256i v, uint8_t* dest ) | |
{ | |
// Pack pairs of 10 bits into 20, in 32-bit lanes | |
__m256i high = _mm256_srli_epi32( v, 16 - 10 ); | |
const __m256i low10 = _mm256_set1_epi32( ( 1 << 10 ) - 1 ); // Bitmask of 10 lowest bits in 32-bit lanes | |
__m256i low = _mm256_and_si256( v, low10 ); | |
high = _mm256_andnot_si256( low10, high ); | |
v = _mm256_or_si256( low, high ); |