This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ==== | |
#include <array> | |
#include <immintrin.h> | |
#include <assert.h> | |
#include <float.h> | |
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval | |
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ==== | |
#include <array> | |
#include <immintrin.h> | |
#include <assert.h> | |
#include <float.h> | |
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval | |
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ==== | |
#include <array> | |
#include <immintrin.h> | |
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval | |
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ | |
// Load 16 bytes from memory | |
__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
// Compute product of width*16 column major matrix by vector of length `width`, | |
// the result is a vector of length 16 | |
// BTW, according to godbolt.org, gcc does better than clang for this code. | |
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi ) | |
{ | |
// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors | |
__m256 a00 = _mm256_setzero_ps(); | |
__m256 a01 = _mm256_setzero_ps(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
// Compute product of width*16 column major matrix by vector of length `width`, | |
// the result is a vector of length 16 | |
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi ) | |
{ | |
// Using 2 accumulators per row to workaround data dependency on the accumulators | |
// Initialize the accumulators | |
__m256 a00 = _mm256_setzero_ps(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Runtime.InteropServices; | |
using Whisper; | |
/// <summary>This class demonstrates how to implement iAudioBuffer COM interface in C#, to supply audio samples produced by managed code</summary> | |
/// <remarks>The library requires these samples to be <c>float</c> numbers @ 16 kHz sample rate</remarks> | |
sealed class AudioBuffer: iAudioBuffer | |
{ | |
void IDisposable.Dispose() | |
{ | |
free(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Constant buffer with a single member: one 4x4 float matrix.
// NOTE(review): the names suggest a camera / projection transform — confirm against the shader code that reads `camera`.
cbuffer CB_PROJ
{
	// `float4x4` is the explicit spelling of the default `matrix` type
	float4x4 camera;
};
struct VOut | |
{ | |
float3 position : POSITION; | |
float3 r_s : NORMAL; | |
uint bits : BLENDINDICES; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static class Program | |
{ | |
// Make a random transaction between two people. | |
static void randomTransaction( ref int to, ref int from )
{
	// Move up to `maxTransfer` units from `from` to `to`.
	// The transfer is capped by the payer's current balance, so a payer holding fewer than 5 gives away everything it has.
	const int maxTransfer = 5;
	int transferred = ( from < maxTransfer ) ? from : maxTransfer;

	// Debit first, then credit: same order as before, so the result is unchanged even when both refs alias one variable.
	from = from - transferred;
	to = to + transferred;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__m128i convertMask( __m256d src ) | |
{ | |
// Bit-cast into fp32 vector, the intrinsic compiles into no instructions | |
const __m256 f32 = _mm256_castpd_ps( src ); | |
// Split into high/low halves; casting is free, vextractf128 is not. | |
const __m128 low = _mm256_castps256_ps128( f32 ); | |
const __m128 high = _mm256_extractf128_ps( f32, 1 ); | |
// Combine 32-bit values into a single vector with correct order | |
// _mm_shuffle_ps takes first 2 lanes from the first argument, last 2 lanes from the second argument. | |
const __m128 combined = _mm_shuffle_ps( low, high, _MM_SHUFFLE( 2, 0, 2, 0 ) ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "pch.h" | |
#include "RenderDeviceGLImpl.hpp" | |
#include "../../../../NetCore/ModeSet/API/eglContext.h" | |
#include <EGL/egl.h> | |
#include <EGL/eglext.h> | |
#define GL_GLEXT_PROTOTYPES | |
#include <GLES2/gl2.h> | |
#include <GLES2/gl2ext.h> | |
#include <libdrm/drm_fourcc.h> | |
#include <string> |