Skip to content

Instantly share code, notes, and snippets.

@rygorous
rygorous / results.txt
Created November 9, 2022 09:56
Oodle Texture BC1 + RDO
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga out_baseline.dds
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
test_bc1
reading: c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga
auto alpha : RGBA with all Opaque : no alpha
CompressBCN : 64.330 millis, 858.90 c/B, rate= 4.07 MB/s
writing: out_baseline.dds
t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga BC1: rmse=3.4901 hash=0x7b37fc442d36f112
vendor rmse: 3.4953(AMD), 3.4948(NV), 3.4929(Intel), 3.5382(D3D)
rmse_total = 3.490, combined hash=0xe31c1ffb4f7950d3
@rygorous
rygorous / results.txt
Created November 9, 2022 09:44
Oodle Texture BC1 encoder at "Normal" and "High" levels vs ICBC (ICBC RMSEs when decoding for Intel)
C:\devel\libs\icbc>cl /nologo icbc_test.cpp /O2 /arch:AVX2 && icbc_test -dec intel
icbc_test.cpp
Using 32 threads.
Encoding 'c:/devel/media/blops3bc1/t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga': RMSE = 3.510 PSNR = 37.224 TIME = 0.052029 (0.052029)
Encoding 'c:/devel/media/blops3bc1/c_hro_sarah_armor_c_BC1_UNORM_sRGB.tga': RMSE = 4.406 PSNR = 35.251 TIME = 0.012941 (0.012941)
Average Results:
RMSE = 3.707 PSNR = 36.751 TIME = 0.064970 (0.064970)
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga -e2
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
@rygorous
rygorous / f2h.cpp
Created October 19, 2022 19:05
float<->half matching VCVTPS2PH exactly
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
// Float->half conversion with round-to-nearest-even, SSE2+
// leaves half-floats in 32-bit lanes (sign extended)
static inline __m128i F32_to_F16_4x(const __m128 &f)
{
const __m128 mask_sign = _mm_set1_ps(-0.0f);
const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23); // all FP32 values >=this round to +inf
// Scope guard to set up FP state as desired and reset it on exit.
//
// Only the declaration is visible here; the constructor and destructor are
// defined elsewhere.
// NOTE(review): presumably the constructor captures the incoming FP control
// state into saved_state before changing it, and the destructor writes that
// value back -- confirm against the definitions.
struct FPStateScope
{
U32 saved_state; // state held for the lifetime of the scope (U32 is declared elsewhere)
FPStateScope(); // runs on scope entry
~FPStateScope(); // runs on scope exit
};
// ...
@rygorous
rygorous / transpose.cpp
Created September 6, 2022 03:15
MSB-first -> LSB-first Huff table transpose (x86/SSE2 version)
// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab
//
// Holds two byte arrays of identical size: NEWLZ_HUFF_DECODE_TABLE_SIZE entries
// plus 16 bytes of padding each.
// NOTE(review): "sloppy memset" presumably means the code filling these arrays
// may write up to 16 bytes past the last real entry -- confirm against the
// table-building code, which is not visible here.
struct KrakenMSBHuffTab
{
U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lengths; +16 padding for sloppy memset
U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // symbol ids; +16 padding for sloppy memset
};
// NOTE: must match what the ASM inner loops expect (or disable them above)!
struct KrakenHuffElem
{
@rygorous
rygorous / rsqrtss_dump.cpp
Created August 11, 2022 19:43
Intel RSQRTSS logic
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <emmintrin.h>
static inline uint32_t test_func(uint32_t x)
{
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x));
__m128 value = _mm_rsqrt_ss(xs);
uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value));
@rygorous
rygorous / evil.cpp
Last active April 25, 2022 20:55
Mwahaha
#include <stdio.h>
#include <float.h>
#include <math.h>
#include <algorithm>
int main()
{
constexpr size_t count = 20;
float vals[count] =
{
@rygorous
rygorous / cheb.py
Created November 30, 2021 23:34
Chebyshev evaluation pseudocode
# Chebyshev evaluation with interval rescaled to [0,1]
xp = 2*x - 1 # remap to [-1,1] that Cheb basis is easier in
# Constant and linear terms
result = coeffs[0] + xp*coeffs[1]
# Recurrence for remaining terms
cur, prev = xp, 1
t = xp * 2
@rygorous
rygorous / bc4u_interp_to_float.py
Last active September 27, 2021 09:40
BC4 interpolator results to float conversion attempt
# Computes the 32-bit IEEE float bit pattern for x/16320 (where x is given as an integer),
# with rounding slightly off from proper RN, matching observed behavior of AMD BC4_UNORM
# decoder HW
def hw_14bit_to_floatu_v2(x, trace=False):
if x <= 0:
return 0
elif x >= 255 * 64:
return 0x3f800000
else:
# 16320 = 255*64
@rygorous
rygorous / multigetbits.cpp
Created August 5, 2021 02:14
multigetbits
static inline __m128i prefix_sum_u8(__m128i x)
{
#if 1
// alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
// for us)
x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
#else