Skip to content

Instantly share code, notes, and snippets.

// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
// Load 16 bytes from memory
__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
#include <immintrin.h>
// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
// BTW, according to godbolt.org, gcc does better than clang for this code.
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors
__m256 a00 = _mm256_setzero_ps();
__m256 a01 = _mm256_setzero_ps();
#include <immintrin.h>
// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
// Using 2 accumulators per row to work around the data dependency on the accumulators
// Initialize the accumulators
__m256 a00 = _mm256_setzero_ps();
using System.Runtime.InteropServices;
using Whisper;
/// <summary>This class demonstrates how to implement iAudioBuffer COM interface in C#, to supply audio samples produced by managed code</summary>
/// <remarks>The library requires these samples to be <c>float</c> numbers @ 16 kHz sample rate</remarks>
sealed class AudioBuffer: iAudioBuffer
{
void IDisposable.Dispose()
{
free();
// Constant buffer CB_PROJ: exposes a single matrix, `camera`, to the shader.
// NOTE(review): presumably the camera transform applied to vertex positions;
// no use of `camera` is visible in this excerpt, so confirm against the shader body.
cbuffer CB_PROJ
{
// `matrix` with no template arguments is HLSL's default 4x4 float matrix type
matrix camera;
};
struct VOut
{
float3 position : POSITION;
float3 r_s : NORMAL;
uint bits : BLENDINDICES;
static class Program
{
// Make a random transaction between two people.
static void randomTransaction( ref int to, ref int from )
{
	// Moves a fixed-size amount out of `from` and into `to`.
	// The moved amount is the smaller of the fixed size and the current value of `from`.
	const int maxTransfer = 5;
	int moved = ( from < maxTransfer ) ? from : maxTransfer;

	// Debit the source first, then credit the destination
	from = from - moved;
	to = to + moved;
}
__m128i convertMask( __m256d src )
{
// Bit-cast into fp32 vector, the intrinsic compiles into no instructions
const __m256 f32 = _mm256_castpd_ps( src );
// Split into high/low halves; casting is free, vextractf128 is not.
const __m128 low = _mm256_castps256_ps128( f32 );
const __m128 high = _mm256_extractf128_ps( f32, 1 );
// Combine 32-bit values into a single vector with correct order
// _mm_shuffle_ps takes the first 2 lanes from the first argument and the last 2 lanes from the second argument.
const __m128 combined = _mm_shuffle_ps( low, high, _MM_SHUFFLE( 2, 0, 2, 0 ) );
#include "pch.h"
#include "RenderDeviceGLImpl.hpp"
#include "../../../../NetCore/ModeSet/API/eglContext.h"
#include <EGL/egl.h>
#include <EGL/eglext.h>
#define GL_GLEXT_PROTOTYPES
#include <GLES2/gl2.h>
#include <GLES2/gl2ext.h>
#include <libdrm/drm_fourcc.h>
#include <string>