Konstantin Const-me

## vector2Bisect.cpp
// Numerically stable 2D angle bisector algorithm
// (c) 2025 Konstantin http://const.me/
// This source file is subject to the terms of the MIT license https://opensource.org/license/MIT

#include <emmintrin.h> // SSE 1 and 2
#include <pmmintrin.h> // SSE 3
#include <smmintrin.h> // SSE 4.1

// Dot products of two pairs of 2D vectors in xy and zw of the inputs, broadcasted across both xy and zw
static inline __m128 dot2( __m128 a, __m128 b ) noexcept

## multiplyWithDxMath.asm
00007FF7837A1240  vmovups     xmm0,xmmword ptr [rdx]
00007FF7837A1244  vinsertf128 ymm9,ymm0,xmmword ptr [rdx+10h],1
00007FF7837A124B  vmovups     xmm1,xmmword ptr [rdx+20h]
00007FF7837A1250  vinsertf128 ymm10,ymm1,xmmword ptr [rdx+30h],1
00007FF7837A1257  vmovups     xmm0,xmmword ptr [rdx+40h]
00007FF7837A125C  vinsertf128 ymm3,ymm0,xmmword ptr [rdx+50h],1
00007FF7837A1263  vmovups     xmm1,xmmword ptr [rdx+60h]
00007FF7837A1268  vinsertf128 ymm5,ymm1,xmmword ptr [rdx+70h],1
00007FF7837A126F  vperm2f128  ymm2,ymm3,ymm3,0
00007FF7837A1275  vperm2f128  ymm7,ymm3,ymm3,11h

## EigenDX.cpp
// Compiled with VS 2022: Release AMD64, AVX2 ISA, LTCG
// RDTSC time on Ryzen 7 8700G for 1024 matrices: 15834 Eigen, 7224 DirectXMath
constexpr bool useEigen = true;

// Eigen 3.4.0
#include <Eigen/Eigen>
__forceinline void multiplyWithEigen( float* rdi, const float* rsi )
{
	using Mat = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
	static_assert( sizeof( Mat ) == 4 * 4 * sizeof( float ) );

## cosine_simd_omp.cpp
static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)";

#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <chrono>
#include <immintrin.h>
#include <assert.h>
using namespace std;
using namespace std::chrono;

## cosine_simd_opt.cpp
static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)";

#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <chrono>
#include <immintrin.h>
using namespace std;
using namespace std::chrono;

constexpr int SIZE = 640000;

## dot4x4.c
#include <stdint.h>
#include <emmintrin.h>	// SSE 2
#include <tmmintrin.h>	// SSSE 3
#include <smmintrin.h>	// SSE 4.1

// Vector constants for dot4Sse function
struct ConstantVectorsSse
{
	__m128i abcd;
	__m128i lowNibbleMask;

## move-fail.cpp
#include <stdio.h>
#include <vector>
#include <set>

static bool s_log = false;
void message( const char* what )
{
	if( s_log )
	{
		printf( "%s\n", what );

## StringHashMap.cpp
std::vector<std::string> someFunction( const Invocation& invocation )
{
	// Define hash and comparison for string pointers, by value
	struct StringPtrTraits
	{
		size_t operator()( const std::string* rsi ) const
		{
			return std::hash<std::string>()( *rsi );
		}
		bool operator()( const std::string* a, const std::string* b ) const

## varIntDecoder.cpp
#include <immintrin.h>
#include <stdint.h>

// 1 = use `vpgatherdq` to load 4 numbers with a single instruction; 0 = load them with scalar loads.
// It seems that on AMD CPUs scalar loads are slightly faster.
#define USE_GATHER_INSTUCTIONS 0

// Inclusive prefix sum of unsigned bytes, i.e. the offsets of the ends of the numbers.
// When the sum of all bytes exceeds 0xFF, the output is garbage,
// which is fine here because our bytes are in the [0..8] interval.

## IncDecBench.cpp
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
#include <stdio.h>

// Count of set bits in `plus` minus count of set bits in `minus`
// The result is in the [ -32 .. +32 ] interval
inline int popCntDiff( uint32_t plus, uint32_t minus )
{
	plus = __popcnt( plus );
	// Numerically stable 2D angle bisector algorithm
	// (c) 2025 Konstantin http://const.me/
	// This source file is subject to the terms of the MIT license https://opensource.org/license/MIT

	#include <emmintrin.h> // SSE 1 and 2
	#include <pmmintrin.h> // SSE 3
	#include <smmintrin.h> // SSE 4.1

	// Dot products of two pairs of 2D vectors in xy and zw of the inputs, broadcasted across both xy and zw
	static inline __m128 dot2( __m128 a, __m128 b ) noexcept
	00007FF7837A1240 vmovups xmm0,xmmword ptr [rdx]
	00007FF7837A1244 vinsertf128 ymm9,ymm0,xmmword ptr [rdx+10h],1
	00007FF7837A124B vmovups xmm1,xmmword ptr [rdx+20h]
	00007FF7837A1250 vinsertf128 ymm10,ymm1,xmmword ptr [rdx+30h],1
	00007FF7837A1257 vmovups xmm0,xmmword ptr [rdx+40h]
	00007FF7837A125C vinsertf128 ymm3,ymm0,xmmword ptr [rdx+50h],1
	00007FF7837A1263 vmovups xmm1,xmmword ptr [rdx+60h]
	00007FF7837A1268 vinsertf128 ymm5,ymm1,xmmword ptr [rdx+70h],1
	00007FF7837A126F vperm2f128 ymm2,ymm3,ymm3,0
	00007FF7837A1275 vperm2f128 ymm7,ymm3,ymm3,11h
	// Compiled with VS 2022: Release AMD64, AVX2 ISA, LTCG
	// RDTSC time on Ryzen 7 8700G for 1024 matrices: 15834 Eigen, 7224 DirectXMath
	constexpr bool useEigen = true;

	// Eigen 3.4.0
	#include <Eigen/Eigen>
	__forceinline void multiplyWithEigen( float* rdi, const float* rsi )
	{
	using Mat = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
	static_assert( sizeof( Mat ) == 4 * 4 * sizeof( float ) );
	static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)";

	#define _CRT_SECURE_NO_WARNINGS
	#include <iostream>
	#include <chrono>
	#include <immintrin.h>
	#include <assert.h>
	using namespace std;
	using namespace std::chrono;
	#include <stdint.h>
	#include <emmintrin.h> // SSE 2
	#include <tmmintrin.h> // SSSE 3
	#include <smmintrin.h> // SSE 4.1

	// Vector constants for dot4Sse function
	struct ConstantVectorsSse
	{
	__m128i abcd;
	__m128i lowNibbleMask;
	#include <stdio.h>
	#include <vector>
	#include <set>

	static bool s_log = false;
	void message( const char* what )
	{
	if( s_log )
	{
	printf( "%s\n", what );
	std::vector<std::string> someFunction( const Invocation& invocation )
	{
	// Define hash and comparison for string pointers, by value
	struct StringPtrTraits
	{
	size_t operator()( const std::string* rsi ) const
	{
	return std::hash<std::string>()( *rsi );
	}
	bool operator()( const std::string* a, const std::string* b ) const
	#include <immintrin.h>
	#include <stdint.h>

	// 1 = use `vpgatherdq` to load 4 numbers with a single instruction; 0 = load them with scalar loads.
	// It seems that on AMD CPUs scalar loads are slightly faster.
	#define USE_GATHER_INSTUCTIONS 0

	// Inclusive prefix sum of unsigned bytes, i.e. the offsets of the ends of the numbers.
	// When the sum of all bytes exceeds 0xFF, the output is garbage,
	// which is fine here because our bytes are in the [0..8] interval.
	#include <stdint.h>
	#include <immintrin.h>
	#include <intrin.h>
	#include <stdio.h>

	// Count of set bits in `plus` minus count of set bits in `minus`
	// The result is in the [ -32 .. +32 ] interval
	inline int popCntDiff( uint32_t plus, uint32_t minus )
	{
	plus = __popcnt( plus );