Ignacio Castaño castano

## qsort.inl
template <typename T>
struct Compare {
    T lambda;

#if _MSC_VER || __APPLE__
    static int compare(void * cmp, const void * a, const void * b)
#else
    static int compare(const void * a, const void * b, void * cmp)
#endif
    {

## Sphere.cpp
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>

#include "Sphere.h"
#include "Vector.inl"
#include "Box.inl"

#include <float.h> // FLT_MAX

const float radiusEpsilon = 1e-4f;

## gather-emulation.cpp
// Emulating gathers using loads and permutevar8. This made the entire compressor about 15% faster. Both methods require AVX2.

// Load 4 uint8 per lane.
__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]);

if (count <= 8) {
    // Load r_sat in one register:
    Wide8 r07 = load8(r_sat);
    Wide8 g07 = load8(g_sat);
    Wide8 b07 = load8(b_sat);

## avx-first-try.txt
    Wide8 besterror8 = broadcast8(FLT_MAX);
    Vector3_Wide8 beststart8;
    Vector3_Wide8 bestend8;

    // check all possible clusters for this total order
    for (int i = 0; i < s_fourClusterTotal[count - 1]; i += 8)
    {
        //uint c0 = s_fourCluster[i].c0;
        //uint c1 = s_fourCluster[i].c1;
        //uint c2 = s_fourCluster[i].c2;

## bc1-results-2020-04-16.txt
rgbcx v1.12
icbc v1.0
                 RMSE           PSNR            Time

Kodak/Waterloo Image Set:

rgbcx-0          8.128142       29.930977       0.163412
rgbcx-1          8.041399       30.024172       0.182219
rgbcx-2          8.198924       29.855667       0.213994
rgbcx-3          8.005721       30.062794       0.242393

## perfect-quantization-dxt-endpoints.txt
Perfect Quantization of DXT endpoints
-------------------------------------

One of the issues that affect the quality of most DXT compressors is the way floating point colors are rounded.

For example, stb_dxt does:

    max16 =  (unsigned short)(stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11);
    max16 |= (unsigned short)(stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5);
    max16 |= (unsigned short)(stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0);

## bc1-results.txt
                 RMSE           PSNR            Time

Kodak/Waterloo Image Set:

stb              8.202766       29.851597       0.258041
stb-hq           8.009301       30.058910       0.284019
nvtt-fast        8.089954       29.971882       0.445670

nvtt             7.616215       30.496019       6.806233
nvtt-hq          7.562366       30.557650       13.081200

## Sphere.cpp
// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>

#include "Sphere.h"
#include "Vector.inl"
#include "Box.inl"

#include <float.h> // FLT_MAX

using namespace nv;

## gist:b05863af48138dab368b23a0ab86e735
I didn't think there was much to gain from optimizing quantization intervals for vertex compression, but I thought I would give it a try and see for myself.

I have a set of vertices X that I want to quantize. The trivial way to do that is to transform them to the [0,1] range and store them using an integer UNORM format.

At runtime the vertices are reconstructed as follows:

X = Q * m + a

Where:

## hemicube.cpp

#include "hemicube.h"

#define PACK_HEMICUBES 1


static void get_hemicube_face_normal(int index, Vector3 *forward, Vector3 *left, Vector3 *up) {
    // Unwrapped hemicube with positive-Z in the middle.
    switch (index) {
        case 0: *forward = Vector3(+1,  0,  0); *left = Vector3( 0,  1,  0); break;
	template <typename T>
	struct Compare {
	T lambda;

	#if _MSC_VER || __APPLE__
	static int compare(void * cmp, const void * a, const void * b)
	#else
	static int compare(const void * a, const void * b, void * cmp)
	#endif
	{
	// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>

	#include "Sphere.h"
	#include "Vector.inl"
	#include "Box.inl"

	#include <float.h> // FLT_MAX

	const float radiusEpsilon = 1e-4f;
	// Emulating gathers using loads and permutevar8. This made the entire compressor about 15% faster. Both methods require AVX2.

	// Load 4 uint8 per lane.
	__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]);

	if (count <= 8) {
	// Load r_sat in one register:
	Wide8 r07 = load8(r_sat);
	Wide8 g07 = load8(g_sat);
	Wide8 b07 = load8(b_sat);
	Wide8 besterror8 = broadcast8(FLT_MAX);
	Vector3_Wide8 beststart8;
	Vector3_Wide8 bestend8;

	// check all possible clusters for this total order
	for (int i = 0; i < s_fourClusterTotal[count - 1]; i += 8)
	{
	//uint c0 = s_fourCluster[i].c0;
	//uint c1 = s_fourCluster[i].c1;
	//uint c2 = s_fourCluster[i].c2;
	rgbcx v1.12
	icbc v1.0
	RMSE PSNR Time

	Kodak/Waterloo Image Set:

	rgbcx-0 8.128142 29.930977 0.163412
	rgbcx-1 8.041399 30.024172 0.182219
	rgbcx-2 8.198924 29.855667 0.213994
	rgbcx-3 8.005721 30.062794 0.242393
	Perfect Quantization of DXT endpoints
	-------------------------------------

	One of the issues that affect the quality of most DXT compressors is the way floating point colors are rounded.

	For example, stb_dxt does:

	max16 =  (unsigned short)(stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11);
	max16 |= (unsigned short)(stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5);
	max16 |= (unsigned short)(stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0);
	RMSE PSNR Time

	Kodak/Waterloo Image Set:

	stb 8.202766 29.851597 0.258041
	stb-hq 8.009301 30.058910 0.284019
	nvtt-fast 8.089954 29.971882 0.445670

	nvtt 7.616215 30.496019 6.806233
	nvtt-hq 7.562366 30.557650 13.081200
	I didn't think there was much to gain from optimizing quantization intervals for vertex compression, but I thought I would give it a try and see for myself.

	I have a set of vertices X that I want to quantize. The trivial way to do that is to transform them to the [0,1] range and store them using an integer UNORM format.

	At runtime the vertices are reconstructed as follows:

	X = Q * m + a

	Where:

	#include "hemicube.h"

	#define PACK_HEMICUBES 1


	static void get_hemicube_face_normal(int index, Vector3 *forward, Vector3 *left, Vector3 *up) {
	// Unwrapped hemicube with positive-Z in the middle.
	switch (index) {
	case 0: *forward = Vector3(+1, 0, 0); *left = Vector3( 0, 1, 0); break;