Konstantin Const-me

## dot4x4.c
#include <stdint.h>
#include <emmintrin.h>	// SSE 2
#include <tmmintrin.h>	// SSSE 3
#include <smmintrin.h>	// SSE 4.1

// Vector constants for dot4Sse function
struct ConstantVectorsSse
{
	__m128i abcd;
	__m128i lowNibbleMask;

## move-fail.cpp
#include <stdio.h>
#include <vector>
#include <set>

static bool s_log = false;
void message( const char* what )
{
	if( s_log )
	{
		printf( "%s\n", what );

## StringHashMap.cpp
std::vector<std::string> someFunction( const Invocation& invocation )
{
	// Define hash and comparison for string pointers, by value
	struct StringPtrTraits
	{
		size_t operator()( const std::string* rsi ) const
		{
			return std::hash<std::string>()( *rsi );
		}
		bool operator()( const std::string* a, const std::string* b ) const

## varIntDecoder.cpp
#include <immintrin.h>
#include <stdint.h>

// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
// It seems on AMD CPUs scalar loads are slightly faster
#define USE_GATHER_INSTUCTIONS 0

// Inclusive prefix sum of unsigned bytes = offsets of the end of the numbers
// When the sum of all bytes exceeds 0xFF, the output is garbage,
// which is fine here because our bytes are in the [0..8] interval

## IncDecBench.cpp
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
#include <stdio.h>

// Count of set bits in `plus` minus count of set bits in `minus`
// The result is in [ -32 .. +32 ] interval
inline int popCntDiff( uint32_t plus, uint32_t minus )
{
	plus = __popcnt( plus );

## bitGrid-avx2.cpp
// Transform 4 inputs with 4 lookup tables, making 4 outputs
// The 4 inputs are packed in a uint32_t value; each byte is expected to be in the [ 0 .. 15 ] interval
// The 4 tables are in a single AVX2 vector
uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
{
	// Move 4 bytes into SSE vector
	__m128i bytes = _mm_cvtsi32_si128( (int)i4 );
	// Expand bytes into uint64_t lanes
	__m256i v = _mm256_cvtepu8_epi64( bytes );
	// Multiply them by 4 to get shift amounts in bits

## TwoSum.cpp
#include <stdlib.h>
#include <stdio.h>
#include <random>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <optional>
#include <intrin.h>
#include <inttypes.h>

## ReflectTest.cs
using System.Linq.Expressions;
using System.Reflection;
using System.Runtime.CompilerServices;

static class ReflectTest
{
	/// <summary>Generic method to call</summary>
	public static T GetValue<T>( T value )
	{
		return value;

## BranchTest.cpp
#include <stdlib.h>
#include <vector>
#include <intrin.h>
#include <stdint.h>
#include <inttypes.h>

std::vector<char> makeTestVector( bool random )
{
	std::vector<char> result;
	result.resize( 1024 * 16 );

## MatMulTest.cpp
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
	#include <stdint.h>
	#include <emmintrin.h> // SSE 2
	#include <tmmintrin.h> // SSSE 3
	#include <smmintrin.h> // SSE 4.1

	// Vector constants for dot4Sse function
	struct ConstantVectorsSse
	{
	__m128i abcd;
	__m128i lowNibbleMask;
	#include <stdio.h>
	#include <vector>
	#include <set>

	static bool s_log = false;
	void message( const char* what )
	{
	if( s_log )
	{
	printf( "%s\n", what );
	std::vector<std::string> someFunction( const Invocation& invocation )
	{
	// Define hash and comparison for string pointers, by value
	struct StringPtrTraits
	{
	size_t operator()( const std::string* rsi ) const
	{
	return std::hash<std::string>()( *rsi );
	}
	bool operator()( const std::string* a, const std::string* b ) const
	#include <immintrin.h>
	#include <stdint.h>

	// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
	// It seems on AMD CPUs scalar loads are slightly faster
	#define USE_GATHER_INSTUCTIONS 0

	// Inclusive prefix sum of unsigned bytes = offsets of the end of the numbers
	// When the sum of all bytes exceeds 0xFF, the output is garbage
	// Which is fine here because our bytes are in [0..8] interval
	#include <stdint.h>
	#include <immintrin.h>
	#include <intrin.h>
	#include <stdio.h>

	// Count of set bits in `plus` minus count of set bits in `minus`
	// The result is in [ -32 .. +32 ] interval
	inline int popCntDiff( uint32_t plus, uint32_t minus )
	{
	plus = __popcnt( plus );
	// Transform 4 inputs with 4 lookup tables, making 4 outputs
	// The 4 inputs are packed in uint32_t value, each byte is expected to be in [ 0 .. 15 ] interval
	// The 4 tables are in a single AVX2 vector
	uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
	{
	// Move 4 bytes into SSE vector
	__m128i bytes = _mm_cvtsi32_si128( (int)i4 );
	// Expand bytes into uint64_t lanes
	__m256i v = _mm256_cvtepu8_epi64( bytes );
	// Multiply them by 4 to get shift amounts in bits
	#include <stdlib.h>
	#include <stdio.h>
	#include <random>
	#include <vector>
	#include <unordered_map>
	#include <algorithm>
	#include <optional>
	#include <intrin.h>
	#include <inttypes.h>
	using System.Linq.Expressions;
	using System.Reflection;
	using System.Runtime.CompilerServices;

	static class ReflectTest
	{
	/// <summary>Generic method to call</summary>
	public static T GetValue<T>( T value )
	{
	return value;
	// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
	#include <array>
	#include <immintrin.h>
	#include <assert.h>
	#include <float.h>

	// Unpack 32 4-bit fields into 32 bytes
	// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
	inline __m256i bytesFromNibbles( const uint8_t* rsi )
	{