Skip to content

Instantly share code, notes, and snippets.

@catid
Last active June 19, 2018 04:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save catid/1e8e993e1581e92d96515a07c5ce4804 to your computer and use it in GitHub Desktop.
Save catid/1e8e993e1581e92d96515a07c5ce4804 to your computer and use it in GitHub Desktop.
Extra huf_compress heuristic to speed up already-compressed or random data
/* Scan input and build symbol stats */
{ CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
if (largest <= (srcSize >> 7) + 1) {
return 0; /* heuristic : probably not compressible enough */
}
/* Added this more accurate heuristic to handle high entropy data properly -catid */
U64 total = 0;
U64 sum_clogc = 0;
unsigned nonzero_count = 0;
for (unsigned i = 0; i < maxSymbolValue; ++i) {
U32 const c_i = table->count[i];
if (c_i == 0) {
continue;
}
total += c_i;
++nonzero_count;
unsigned const c_bits = BIT_highbit32(c_i) + 1;
sum_clogc += c_i * c_bits;
}
unsigned t_bits = 0;
U32 t_bits_input = (U32)total;
if (total >= 0x100000000ULL) {
t_bits = 32;
t_bits_input = (U32)(total >> 32);
}
t_bits += BIT_highbit32(t_bits_input) + 1;
unsigned estimate_bytes = (unsigned)((total * t_bits - sum_clogc) / 8);
if (nonzero_count > 64) {
estimate_bytes += nonzero_count / 4;
}
else {
estimate_bytes += nonzero_count / 2;
}
if (estimate_bytes > srcSize) {
return 0; /* heuristic : probably not compressible enough */
}
@catid
Copy link
Author

catid commented Jun 19, 2018

It goes in huf_compress.c around like 680 here:

/* Scan input and build symbol stats */
{  CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
    if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
    if (largest <= (srcSize >> 7) + 1) {
        return 0;   /* heuristic : probably not compressible enough */
    }

    /* Added this more accurate heuristic to handle high entropy data properly -catid */

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment