Fabian 'ryg' Giesen rygorous

## timings.txt
---- "Low" effort level (OodleTex_EncodeEffortLevel_Low = 10)

CompressBCN      : 5.258 millis
kodim01.bmp BC7-RGB: rmse=2.4325 hash=0x5dcf9106f8f4415d
CompressBCN      : 5.040 millis
kodim02.bmp BC7-RGB: rmse=2.1168 hash=0x79f45423cd9d3ec0
CompressBCN      : 5.165 millis
kodim03.bmp BC7-RGB: rmse=1.6793 hash=0x806dce71d1ff8293
CompressBCN      : 5.092 millis
kodim04.bmp BC7-RGB: rmse=2.1611 hash=0x681ebb3045e254ec

## advance_refill.cpp
    // advance
    for (int i = 0; i < num_streams; ++i)
    {
        std::string desc = formatf("advance %d", i);
        bool is_reverse_stream = (i % 3) == 1;
        if (EARLY_CLZ != 2)
        {
            if (EARLY_CLZ == 0)
                bb->append(CLZ(bits[i], bits[i]).set_comment(desc)); // figure out how many bits we consumed
            if (is_reverse_stream)

## bc7_single_color.cpp
void bc7_encode_single_color_block(U8 * output_bc7, const U8 rgba[4])
{
	U64 r = rgba[0];
	U64 g = rgba[1];
	U64 b = rgba[2];
	U64 a = rgba[3];

	const U64 bit6_mask = (0x40 << 8) | (0x40 << 22) | (0x40ull << 36);
	const U64 lo7_mask  = (0x7f << 8) | (0x7f << 22) | (0x7full << 36);
	U64 color_bits;

## gist:efc460d0154347ebe17bde7275070d9b
reading: c:\devel\media\bc1speedlevel/M_MED_Kurohomura_Backpack_Textures_T_Kurohomura_BP_backpack_D.uasset_build.mip0.png
CompressBC7      : 65.720 millis, 3.51 kc/B, rate= 997.20 kB/s
per-pixel rmse : 0.6425
mode stats (8=invalid 9=solid):
[0]     567 (  0.87%)
[1]    5465 (  8.34%)
[2]     126 (  0.19%)
[3]   15855 ( 24.19%)
[4]     909 (  1.39%)
[5]   15072 ( 23.00%)

## gist:2e467331d6f7766b4c22a02bd426b545
Cixin Liu, "The Three-Body Problem", Chapter 4 "The Frontiers of Science"

Page numbers given from trade paperback, ISBN 978-0-7653-8203-0

Pg. 53:

  Shi and the young cop said nothing. Both turned and went down the stairs. The two
  army officers watched them leave and seemed to sigh with relief.

  "What's wrong with that guy?" the major whispered to the other officer.

## c_dct_repro_block.glsl
#version 430 core

layout(local_size_x=1) in;

layout(binding=0) uniform isamplerBuffer coeff_data;

layout(binding=0, std430) restrict buffer debug_buf {
    uint debug[];
};

## nvgl_kmd_crash.cpp
#include <stdio.h>
#include <stdlib.h>
#include <GL/gl3w.h>
#include "util.h"
#include "radglx.h"

#define CHECK_RESULTS           // turn this off and we'll keep running until the driver crashes.
#define MIN_ALIGN       1       // setting this to 128 (or higher) fixes the bug (both incorrect results and crash)

typedef unsigned char U8;

## gist:5c8aad95ef36c9cab3c9f2d6cfeedd8d
mov  rax, [codewords]
pext rax, [masks] ; coalesce code words
shlx rax, rax, rDestBitPos ; still need something like this
or   rOut, rax ; and this
add  rDestBitPos, [total] ; this and make SIMD code emit it, or load masks to a reg then do popcnt on it?

; so 5-6 insns per 4 codewords = 1.25-1.5 insns per codeword
; leaves 2.5-2.75 insns per codeword to assemble codewords SIMD
; our best bet here is, I guess, 8 16-bit codewords at once (AVX2)
;

## simd_multigetbits.cpp
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <smmintrin.h>

#ifdef __RADAVX__
#include <immintrin.h>
#endif

## simple_multigetbits.cpp
// Returns 8 bit fields at the given positions (in bits) and of the
// given widths as 16-bit integers, with the values aligned with the
// MSB at the top and garbage in the lower-order bits.
//
// The individual lens must be <=8, the positions are bit offsets
// into the 128-bit "bytes".
template<
	int pos0, int len0,
	int pos1, int len1,
	int pos2, int len2,
	---- "Low" effort level (OodleTex_EncodeEffortLevel_Low = 10)

	CompressBCN : 5.258 millis
	kodim01.bmp BC7-RGB: rmse=2.4325 hash=0x5dcf9106f8f4415d
	CompressBCN : 5.040 millis
	kodim02.bmp BC7-RGB: rmse=2.1168 hash=0x79f45423cd9d3ec0
	CompressBCN : 5.165 millis
	kodim03.bmp BC7-RGB: rmse=1.6793 hash=0x806dce71d1ff8293
	CompressBCN : 5.092 millis
	kodim04.bmp BC7-RGB: rmse=2.1611 hash=0x681ebb3045e254ec
	// advance
	for (int i = 0; i < num_streams; ++i)
	{
	std::string desc = formatf("advance %d", i);
	bool is_reverse_stream = (i % 3) == 1;
	if (EARLY_CLZ != 2)
	{
	if (EARLY_CLZ == 0)
	bb->append(CLZ(bits[i], bits[i]).set_comment(desc)); // figure out how many bits we consumed
	if (is_reverse_stream)
	void bc7_encode_single_color_block(U8 * output_bc7, const U8 rgba[4])
	{
	U64 r = rgba[0];
	U64 g = rgba[1];
	U64 b = rgba[2];
	U64 a = rgba[3];

	const U64 bit6_mask = (0x40 << 8) | (0x40 << 22) | (0x40ull << 36);
	const U64 lo7_mask = (0x7f << 8) | (0x7f << 22) | (0x7full << 36);
	U64 color_bits;
	reading: c:\devel\media\bc1speedlevel/M_MED_Kurohomura_Backpack_Textures_T_Kurohomura_BP_backpack_D.uasset_build.mip0.png
	CompressBC7 : 65.720 millis, 3.51 kc/B, rate= 997.20 kB/s
	per-pixel rmse : 0.6425
	mode stats (8=invalid 9=solid):
	[0] 567 ( 0.87%)
	[1] 5465 ( 8.34%)
	[2] 126 ( 0.19%)
	[3] 15855 ( 24.19%)
	[4] 909 ( 1.39%)
	[5] 15072 ( 23.00%)
	Cixin Liu, "The Three-Body Problem", Chapter 4 "The Frontiers of Science"

	Page numbers given from trade paperback, ISBN 978-0-7653-8203-0

	Pg. 53:

	Shi and the young cop said nothing. Both turned and went down the stairs. The two
	army officers watched them leave and seemed to sigh with relief.

	"What's wrong with that guy?" the major whispered to the other officer.
	#version 430 core

	layout(local_size_x=1) in;

	layout(binding=0) uniform isamplerBuffer coeff_data;

	layout(binding=0, std430) restrict buffer debug_buf {
	uint debug[];
	};
	#include <stdio.h>
	#include <stdlib.h>
	#include <GL/gl3w.h>
	#include "util.h"
	#include "radglx.h"

	#define CHECK_RESULTS // turn this off and we'll keep running until the driver crashes.
	#define MIN_ALIGN 1 // setting this to 128 (or higher) fixes the bug (both incorrect results and crash)

	typedef unsigned char U8;
	mov rax, [codewords]
	pext rax, [masks] ; coalesce code words
	shlx rax, rax, rDestBitPos ; still need something like this
	or rOut, rax ; and this
	add rDestBitPos, [total] ; this and make SIMD code emit it, or load masks to a reg then do popcnt on it?

	; so 5-6 insns per 4 codewords = 1.25-1.5 insns per codeword
	; leaves 2.5-2.75 insns per codeword to assemble codewords SIMD
	; our best bet here is, I guess, 8 16-bit codewords at once (AVX2)
	;
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <smmintrin.h>

	#ifdef __RADAVX__
	#include <immintrin.h>
	#endif
	// Returns 8 bit fields at the given positions (in bits) and of the
	// given widths as 16-bit integers, with the values aligned with the
	// MSB at the top and garbage in the lower-order bits.
	//
	// The individual lens must be <=8, the positions are bit offsets
	// into the 128-bit "bytes".
	template<
	int pos0, int len0,
	int pos1, int len1,
	int pos2, int len2,