This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga out_baseline.dds
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
test_bc1
reading: c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga
auto alpha : RGBA with all Opaque : no alpha
CompressBCN : 64.330 millis, 858.90 c/B, rate= 4.07 MB/s
writing: out_baseline.dds
t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga BC1: rmse=3.4901 hash=0x7b37fc442d36f112
vendor rmse: 3.4953(AMD), 3.4948(NV), 3.4929(Intel), 3.5382(D3D)
rmse_total = 3.490, combined hash=0xe31c1ffb4f7950d3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
C:\devel\libs\icbc>cl /nologo icbc_test.cpp /O2 /arch:AVX2 && icbc_test -dec intel
icbc_test.cpp
Using 32 threads.
Encoding 'c:/devel/media/blops3bc1/t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga': RMSE = 3.510 PSNR = 37.224 TIME = 0.052029 (0.052029)
Encoding 'c:/devel/media/blops3bc1/c_hro_sarah_armor_c_BC1_UNORM_sRGB.tga': RMSE = 4.406 PSNR = 35.251 TIME = 0.012941 (0.012941)
Average Results:
RMSE = 3.707 PSNR = 36.751 TIME = 0.064970 (0.064970)
C:\devel\libs\icbc>\devel\projects\oodle2\cdepbuild\win64_release\textest bc1 c:\devel\media\blops3bc1\t7_decal_grunge_dust_wall_01_c_BC1_UNORM_sRGB.tga -e2
test_framework 2.9.9 built Nov 8 2022 16:18:59 on Win-x64 msvc-1929
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdint.h> | |
#include <immintrin.h> | |
// Float->half conversion with round-to-nearest-even, SSE2+ | |
// leaves half-floats in 32-bit lanes (sign extended) | |
static inline __m128i F32_to_F16_4x(const __m128 &f) | |
{ | |
const __m128 mask_sign = _mm_set1_ps(-0.0f); | |
const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23); // all FP32 values >=this round to +inf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scope guard to set up FP state as desired and reset it on exit | |
struct FPStateScope | |
{ | |
U32 saved_state; | |
FPStateScope(); | |
~FPStateScope(); | |
}; | |
// ... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Un-bit-reversed huff table in MSB-first decode order, used in building the real KrakenHuffTab | |
struct KrakenMSBHuffTab | |
{ | |
U8 len[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // code lens; +16 for sloppy memset | |
U8 sym[NEWLZ_HUFF_DECODE_TABLE_SIZE + 16]; // sym id; +16 for sloppy memset | |
}; | |
// NOTE: must match what the ASM inner loops expect (or disable them above)! | |
struct KrakenHuffElem | |
{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdint.h> | |
#include <emmintrin.h> | |
static inline uint32_t test_func(uint32_t x) | |
{ | |
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x)); | |
__m128 value = _mm_rsqrt_ss(xs); | |
uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <float.h> | |
#include <math.h> | |
#include <algorithm> | |
int main() | |
{ | |
constexpr size_t count = 20; | |
float vals[count] = | |
{ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chebyshev evaluation with interval rescaled to [0,1] | |
xp = 2*x - 1 # remap to [-1,1] that Cheb basis is easier in | |
# Constant and linear terms | |
result = coeffs[0] + xp*coeffs[1] | |
# Recurrence for remaining terms | |
cur, prev = xp, 1 | |
t = xp * 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Computes the 32-bit IEEE float bit pattern for x/16320 (where x is given as an integer), | |
# with rounding slightly off from proper RN, matching observed behavior of AMD BC4_UNORM | |
# decoder HW | |
def hw_14bit_to_floatu_v2(x, trace=False): | |
if x <= 0: | |
return 0 | |
elif x >= 255 * 64: | |
return 0x3f800000 | |
else: | |
# 16320 = 255*64 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static inline __m128i prefix_sum_u8(__m128i x) | |
{ | |
#if 1 | |
// alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck | |
// for us) | |
x = _mm_add_epi8(x, _mm_slli_epi64(x, 8)); | |
x = _mm_add_epi8(x, _mm_slli_epi64(x, 16)); | |
x = _mm_add_epi8(x, _mm_slli_epi64(x, 32)); | |
x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7))); | |
#else |