Fabian 'ryg' Giesen rygorous

## results.txt
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga out_baseline.dds
test_framework 2.9.9 built Nov  8 2022 16:18:59 on Win-x64 msvc-1929
test_bc1
reading: c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga
auto alpha : RGBA with all Opaque : no alpha
CompressBCN      : 64.330 millis, 858.90 c/B, rate= 4.07 MB/s
writing: out_baseline.dds
t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga BC1: rmse=3.4901 hash=0x7b37fc442d36f112
vendor rmse: 3.4953(AMD), 3.4948(NV), 3.4929(Intel), 3.5382(D3D)
rmse_total = 3.490, combined hash=0xe31c1ffb4f7950d3

## results.txt
C:\devel\libs\icbc>cl /nologo icbc_test.cpp /O2 /arch:AVX2 && icbc_test -dec intel
icbc_test.cpp
Using 32 threads.
Encoding 'c:/devel/media/blops3bc1/t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga':  RMSE = 3.510    PSNR = 37.224   TIME = 0.052029 (0.052029)
Encoding 'c:/devel/media/blops3bc1/c_hro_sarah_armor_c_BC1_UNORM_sRGB.tga':     RMSE = 4.406    PSNR = 35.251   TIME = 0.012941 (0.012941)
Average Results:
        RMSE = 3.707    PSNR = 36.751   TIME = 0.064970 (0.064970)

C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga -e2
test_framework 2.9.9 built Nov  8 2022 16:18:59 on Win-x64 msvc-1929

## f2h.cpp
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

// Float->half conversion with round-to-nearest-even, SSE2+
// leaves half-floats in 32-bit lanes (sign extended)
static inline __m128i F32_to_F16_4x(const __m128 &f)
{
	const __m128 mask_sign			= _mm_set1_ps(-0.0f);
	const __m128i c_f16max			= _mm_set1_epi32((127 + 16) << 23); // all FP32 values >=this round to +inf

## gist:5c815b74e8428fe9ab1e75495c59a9ce
// Scope guard to set up FP state as desired and reset it on exit.
//
// Intended use is as a local object: the constructor runs where the object is
// declared and the destructor runs when the enclosing scope is left.
// Only the declarations are visible here; the constructor and destructor
// bodies are defined elsewhere.
struct FPStateScope
{
	// NOTE(review): presumably the FP state captured by the constructor so the
	// destructor can put it back -- confirm against the out-of-line definitions.
	U32 saved_state;

	// Sets up the desired FP state (per the description above).
	FPStateScope();
	// Resets the FP state on exit (per the description above).
	~FPStateScope();
};

// ...

## transpose.cpp
// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab
//
// Holds two byte arrays of identical size: code lengths in `len` and symbol ids
// in `sym`. Each array has 16 extra bytes beyond NEWLZ_HUFF_DECODE_TABLE_SIZE.
// NOTE(review): "sloppy memset" presumably means the code that fills these
// arrays may write a few bytes past the last real entry, and the padding keeps
// those writes inside the array -- the fill code is not part of this excerpt.
struct KrakenMSBHuffTab
{
	U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lens; +16 for sloppy memset
	U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // sym id; +16 for sloppy memset
};

// NOTE: must match what the ASM inner loops expect (or disable them above)!
struct KrakenHuffElem
{

## rsqrtss_dump.cpp
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <emmintrin.h>

static inline uint32_t test_func(uint32_t x)
{
	__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x));
	__m128 value = _mm_rsqrt_ss(xs);
	uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value));

## evil.cpp
#include <stdio.h>
#include <float.h>
#include <math.h>
#include <algorithm>

int main()
{
    constexpr size_t count = 20;
    float vals[count] =
    {

## cheb.py
# Chebyshev evaluation with interval rescaled to [0,1]
# `x` and `coeffs` are defined outside this excerpt.
xp = 2*x - 1 # remap [0,1] to [-1,1], the interval the Cheb basis is easier to work in

# Constant and linear terms: coeffs[0] is used as-is, coeffs[1] is scaled by xp
result = coeffs[0] + xp*coeffs[1]

# Recurrence for remaining terms
# cur/prev are seeded with xp and 1, matching the linear and constant terms above.
# NOTE(review): t = 2*xp is presumably the multiplier in the usual three-term
# Chebyshev recurrence (next = t*cur - prev); the loop itself is not in this excerpt.
cur, prev = xp, 1
t = xp * 2

## bc4u_interp_to_float.py
# Computes the 32-bit IEEE float bit pattern for x/16320 (where x is given as an integer),
# with rounding slightly off from proper RN, matching observed behavior of AMD BC4_UNORM
# decoder HW
def hw_14bit_to_floatu_v2(x, trace=False):
    if x <= 0:
        return 0
    elif x >= 255 * 64:
        return 0x3f800000
    else:
        # 16320 = 255*64

## multigetbits.cpp
static inline __m128i prefix_sum_u8(__m128i x)
{
#if 1
	// alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
	// for us)
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
	x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
#else
	C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga out_baseline.dds
	test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
	test_bc1
	reading: c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga
	auto alpha : RGBA with all Opaque : no alpha
	CompressBCN : 64.330 millis, 858.90 c/B, rate= 4.07 MB/s
	writing: out_baseline.dds
	t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga BC1: rmse=3.4901 hash=0x7b37fc442d36f112
	vendor rmse: 3.4953(AMD), 3.4948(NV), 3.4929(Intel), 3.5382(D3D)
	rmse_total = 3.490, combined hash=0xe31c1ffb4f7950d3
	C:\devel\libs\icbc>cl /nologo icbc_test.cpp /O2 /arch:AVX2 && icbc_test -dec intel
	icbc_test.cpp
	Using 32 threads.
	Encoding 'c:/devel/media/blops3bc1/t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga': RMSE = 3.510 PSNR = 37.224 TIME = 0.052029 (0.052029)
	Encoding 'c:/devel/media/blops3bc1/c_hro_sarah_armor_c_BC1_UNORM_sRGB.tga': RMSE = 4.406 PSNR = 35.251 TIME = 0.012941 (0.012941)
	Average Results:
	RMSE = 3.707 PSNR = 36.751 TIME = 0.064970 (0.064970)

	C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga -e2
	test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
	#include <stdio.h>
	#include <stdint.h>
	#include <immintrin.h>

	// Float->half conversion with round-to-nearest-even, SSE2+
	// leaves half-floats in 32-bit lanes (sign extended)
	static inline __m128i F32_to_F16_4x(const __m128 &f)
	{
	const __m128 mask_sign = _mm_set1_ps(-0.0f);
	const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23); // all FP32 values >=this round to +inf
	// Scope guard to set up FP state as desired and reset it on exit.
	// Only the declarations are visible here; the constructor and destructor
	// bodies are defined elsewhere.
	struct FPStateScope
	{
	// NOTE(review): presumably the FP state captured by the constructor so the
	// destructor can put it back -- confirm against the out-of-line definitions.
	U32 saved_state;

	// Sets up the desired FP state (per the description above).
	FPStateScope();
	// Resets the FP state on exit (per the description above).
	~FPStateScope();
	};

	// ...
	// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab
	// Holds two byte arrays of identical size: code lengths in `len` and symbol
	// ids in `sym`, each with 16 extra bytes beyond NEWLZ_HUFF_DECODE_TABLE_SIZE.
	// NOTE(review): the padding presumably lets fill code overrun the last real
	// entry safely ("sloppy memset"); the fill code is not part of this excerpt.
	struct KrakenMSBHuffTab
	{
	U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lens; +16 for sloppy memset
	U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // sym id; +16 for sloppy memset
	};

	// NOTE: must match what the ASM inner loops expect (or disable them above)!
	struct KrakenHuffElem
	{
	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <emmintrin.h>

	static inline uint32_t test_func(uint32_t x)
	{
	__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x));
	__m128 value = _mm_rsqrt_ss(xs);
	uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value));
	#include <stdio.h>
	#include <float.h>
	#include <math.h>
	#include <algorithm>

	int main()
	{
	constexpr size_t count = 20;
	float vals[count] =
	{
	# Chebyshev evaluation with interval rescaled to [0,1]
	xp = 2*x - 1 # remap to [-1,1] that Cheb basis is easier in

	# Constant and linear terms
	result = coeffs[0] + xp*coeffs[1]

	# Recurrence for remaining terms
	cur, prev = xp, 1
	t = xp * 2
	# Computes the 32-bit IEEE float bit pattern for x/16320 (where x is given as an integer),
	# with rounding slightly off from proper RN, matching observed behavior of AMD BC4_UNORM
	# decoder HW
	def hw_14bit_to_floatu_v2(x, trace=False):
	if x <= 0:
	return 0
	elif x >= 255 * 64:
	return 0x3f800000
	else:
	# 16320 = 255*64
	static inline __m128i prefix_sum_u8(__m128i x)
	{
	#if 1
	// alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
	// for us)
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
	x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
	#else