Konstantin Const-me

## dotProduct_q40_f16.cpp
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{

## QuantisationTest.cpp
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{

## QuantisationTest.cpp
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
	// Load 16 bytes from memory
	__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );

## multiplyInner_avx16.cpp
#include <immintrin.h>

// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
// BTW, according to godbolt.org, gcc does better than clang for this code.
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
	// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors
	__m256 a00 = _mm256_setzero_ps();
	__m256 a01 = _mm256_setzero_ps();

## multiplyInner_avx16.cpp
#include <immintrin.h>

// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
	// Using 2 accumulators per row to workaround data dependency on the accumulators

	// Initialize the accumulators
	__m256 a00 = _mm256_setzero_ps();

## AudioBuffer.cs
using System.Runtime.InteropServices;
using Whisper;

/// <summary>This class demonstrates how to implement iAudioBuffer COM interface in C#, to supply audio samples produced by managed code</summary>
/// <remarks>The library requires these samples to be <c>float</c> numbers @ 16 kHz sample rate</remarks>
sealed class AudioBuffer: iAudioBuffer
{
	void IDisposable.Dispose()
	{
		free();

## OrientedQuadsGS.hlsl
// Constant buffer with a single member, the `camera` matrix.
// NOTE(review): judging by the buffer name this is presumably the camera projection
// transform applied by the geometry shader — confirm against the shader body.
cbuffer CB_PROJ
{
	// `matrix` without template arguments, i.e. the HLSL default matrix type
	matrix camera;
};

struct VOut
{
	float3 position : POSITION;
	float3 r_s : NORMAL;
	uint   bits : BLENDINDICES;

## WealthSim.cs
static class Program
{
	// Move money from one person to another.
	// `from` is the balance of the payer, `to` is the balance of the receiver; both are updated in place.
	// The transfer is capped at 5 units; when the payer holds less than that, the whole of `from` is moved.
	static void randomTransaction( ref int to, ref int from )
	{
		const int maxTransfer = 5;
		// Never take more than the payer currently has
		int moved = ( from < maxTransfer ) ? from : maxTransfer;
		from = from - moved;
		to = to + moved;
	}

## convertMask.cpp
__m128i convertMask( __m256d src )
{
	// Bit-cast into fp32 vector, the intrinsic compiles into no instructions
	const __m256 f32 = _mm256_castpd_ps( src );
	// Split into high/low halves; casting is free, vextractf128 is not.
	const __m128 low = _mm256_castps256_ps128( f32 );
	const __m128 high = _mm256_extractf128_ps( f32, 1 );
	// Combine 32-bit values into a single vector with correct order
	// _mm_shuffle_ps takes the first 2 lanes from the first argument, and the last 2 lanes from the second argument.
	const __m128 combined = _mm_shuffle_ps( low, high, _MM_SHUFFLE( 2, 0, 2, 0 ) );

## importNv12Texture.cpp
#include "pch.h"
#include "RenderDeviceGLImpl.hpp"
#include "../../../../NetCore/ModeSet/API/eglContext.h"
#include <EGL/egl.h>
#include <EGL/eglext.h>
#define GL_GLEXT_PROTOTYPES
#include <GLES2/gl2.h>
#include <GLES2/gl2ext.h>
#include <libdrm/drm_fourcc.h>
#include <string>
	// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
	#include <array>
	#include <immintrin.h>
	#include <assert.h>
	#include <float.h>

	// Unpack 32 4-bit fields into 32 bytes
	// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
	inline __m256i bytesFromNibbles( const uint8_t* rsi )
	{
	#include <immintrin.h>

	// Compute product of width*16 column major matrix by vector of length `width`,
	// the result is a vector of length 16
	// BTW, according to godbolt.org, gcc does better than clang for this code.
	void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
	{
	// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors
	__m256 a00 = _mm256_setzero_ps();
	__m256 a01 = _mm256_setzero_ps();
	using System.Runtime.InteropServices;
	using Whisper;

	/// <summary>This class demonstrates how to implement iAudioBuffer COM interface in C#, to supply audio samples produced by managed code</summary>
	/// <remarks>The library requires these samples to be <c>float</c> numbers @ 16 kHz sample rate</remarks>
	sealed class AudioBuffer: iAudioBuffer
	{
	void IDisposable.Dispose()
	{
	free();
	// Constant buffer with a single member, the `camera` matrix.
	// NOTE(review): judging by the buffer name this is presumably the camera projection
	// transform — confirm against the shader which consumes it.
	cbuffer CB_PROJ
	{
	matrix camera;
	};

	struct VOut
	{
	float3 position : POSITION;
	float3 r_s : NORMAL;
	uint bits : BLENDINDICES;
	static class Program
	{
	// Move money from one person to another.
	// `from` is the balance of the payer, `to` is the balance of the receiver; both are updated in place.
	// The transfer is capped at 5 units; when the payer holds less than that, the whole of `from` is moved.
	static void randomTransaction( ref int to, ref int from )
	{
		const int maxTransfer = 5;
		// Never take more than the payer currently has
		int moved = ( from < maxTransfer ) ? from : maxTransfer;
		from = from - moved;
		to = to + moved;
	}
	__m128i convertMask( __m256d src )
	{
	// Bit-cast into fp32 vector, the intrinsic compiles into no instructions
	const __m256 f32 = _mm256_castpd_ps( src );
	// Split into high/low halves; casting is free, vextractf128 is not.
	const __m128 low = _mm256_castps256_ps128( f32 );
	const __m128 high = _mm256_extractf128_ps( f32, 1 );
	// Combine 32-bit values into a single vector with correct order
	// _mm_shuffle_ps takes the first 2 lanes from the first argument, and the last 2 lanes from the second argument.
	const __m128 combined = _mm_shuffle_ps( low, high, _MM_SHUFFLE( 2, 0, 2, 0 ) );
	#include "pch.h"
	#include "RenderDeviceGLImpl.hpp"
	#include "../../../../NetCore/ModeSet/API/eglContext.h"
	#include <EGL/egl.h>
	#include <EGL/eglext.h>
	#define GL_GLEXT_PROTOTYPES
	#include <GLES2/gl2.h>
	#include <GLES2/gl2ext.h>
	#include <libdrm/drm_fourcc.h>
	#include <string>