@nigeltao
Created August 25, 2017 02:13
zlib: diff -u inffast.c contrib/inffast64/inffast64.c
--- inffast.c 2017-08-25 12:05:02.309161736 +1000
+++ contrib/inffast64/inffast64.c 2017-08-25 12:09:05.504252162 +1000
@@ -11,8 +11,11 @@
#ifdef ASMINF
# pragma message("Assembler code may have bugs -- use at your own risk")
#elif INFLATE_FAST64
-/* Skip this implementation; use contrib/inffast64/inffast64.c instead. */
-#else
+
+#if defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS) || \
+ defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN)
+#include <stdint.h>
+#endif
/*
Decode literal, length, and distance codes and write out the resulting
@@ -45,10 +48,28 @@
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
+ - On some architectures, it can be significantly faster (e.g. up to 1.2x
+ faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+ time, so INFLATE_FAST_MIN_HAVE == 8. This requires a little endian
+ architecture.
+
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
+
+ - On some architectures, for length-distance copies, we similarly load and
+ store 8 bytes at a time, if the distance is at least 8. Again, this can
+ be significantly faster (e.g. up to 1.3x faster on x86_64). Rounding up
+ to a multiple of 8 gives INFLATE_FAST_MIN_LEFT == 264. This does not
+ require a little endian architecture. This always copies 8*L bytes (where
+ L is the smallest integer such that 8*L >= len, i.e. we round length up
+ to a multiple of 8), instead of only len bytes, but that's OK, as
+ subsequent iterations will fix the overrun.
+
+ - Combining those two optimizations for 64 bit unaligned loads gives up to
+ a 1.5x throughput improvement on x86_64.
+
*/
void ZLIB_INTERNAL inflate_fast(strm, start)
z_streamp strm;
@@ -67,7 +88,54 @@
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
- unsigned long hold; /* local strm->hold */
+
+ /* hold is a local copy of strm->hold. By default, hold satisfies the same
+ invariants that strm->hold does, namely that (hold >> bits) == 0. This
+ invariant is kept by loading bits into hold one byte at a time, like:
+
+ hold |= next_byte_of_input << bits; in++; bits += 8;
+
+ If we need to ensure that bits >= 15 then this code snippet is simply
+ repeated. Over one iteration of the outermost do/while loop, this
+ happens up to six times (48 bits of input), as described in the NOTES
+ above.
+
+ However, on some little endian architectures, it can be significantly
+ faster to load 64 bits once instead of 8 bits six times:
+
+ if (bits <= 16) {
+ hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+ }
+
+ Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+ by bits will overflow and lose those high bits, up to 2 bytes' worth.
+ The conservative estimate is therefore that we have read only 6 bytes
+ (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+ rest of the iteration, and we will not need to load another 8 bytes.
+
+ Inside this function, we no longer satisfy (hold >> bits) == 0, but
+ this is not problematic, even if that overflow does not land on an 8 bit
+ byte boundary. Those excess bits will eventually shift down lower as the
+ Huffman decoder consumes input, and when new input bits need to be loaded
+ into the bits variable, the same input bits will be or'ed over those
+ existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b).
+ Note that we therefore write that load operation as "hold |= etc" and not
+ "hold += etc".
+
+ Outside that loop, at the end of the function, hold is bitwise and'ed
+ with (1<<bits)-1 to drop those excess bits so that, on function exit, we
+ keep the invariant that (state->hold >> state->bits) == 0.
+
+ TODO: rename the bits variable to nbits, so that this block comment
+ is less confusing when discussing bits (the variable) and bits (one
+ eighth of a byte).
+ */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ uint64_t hold;
+#else
+ unsigned long hold;
+#endif
+
unsigned bits; /* local strm->bits */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
@@ -105,10 +173,41 @@
input data or output space */
do {
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ /* Example disassembly on __x86_64__:
+ 49 8b 06 mov (%r14),%rax
+ 89 f1 mov %esi,%ecx
+ 49 83 c6 06 add $0x6,%r14
+ 83 c6 30 add $0x30,%esi
+ 48 d3 e0 shl %cl,%rax
+ 48 09 c2 or %rax,%rdx
+ */
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
+ /* Example disassembly on __x86_64__:
+ 45 0f b6 45 00 movzbl 0x0(%r13),%r8d
+ 89 f1 mov %esi,%ecx
+ 49 83 c5 02 add $0x2,%r13
+ 49 d3 e0 shl %cl,%r8
+ 8d 4e 08 lea 0x8(%rsi),%ecx
+ 83 c6 10 add $0x10,%esi
+ 49 01 d0 add %rdx,%r8
+ 41 0f b6 55 ff movzbl -0x1(%r13),%edx
+ 48 d3 e2 shl %cl,%rdx
+ 4c 01 c2 add %r8,%rdx
+ */
+ /* TODO: replace "hold += etc" with "hold |= etc", here and below,
+ to be consistent with the 64 bit unaligned code path. This is
+ only a comment for now so that the commit that introduced this
+ comment has no effect whatsoever on architectures without
+ ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN. */
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = lcode[hold & lmask];
dolen:
@@ -127,8 +226,14 @@
op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
@@ -136,10 +241,16 @@
}
Tracevv((stderr, "inflate: length %u\n", len));
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = dcode[hold & dmask];
dodist:
@@ -151,12 +262,18 @@
dist = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
+#endif
}
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
@@ -239,6 +356,22 @@
from = out - dist; /* rest from output */
}
}
+
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
while (len > 2) {
*out++ = *from++;
*out++ = *from++;
@@ -253,6 +386,21 @@
}
else {
from = out - dist; /* copy direct from output */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
do { /* minimum length is three */
*out++ = *from++;
*out++ = *from++;
@@ -326,4 +474,4 @@
- Moving len -= 3 statement into middle of loop
*/
-#endif /* !ASMINF && !INFLATE_FAST64 */
+#endif /* !ASMINF && INFLATE_FAST64 */
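
The patch gates its fast paths on two feature-test macros, ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS and ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN, but the diff does not show where they get defined. One plausible way to set them at compile time (an assumption for illustration, not taken from the gist):

#if defined(__x86_64__) || defined(_M_X64)
/* x86_64 is little endian and handles unaligned 64-bit loads efficiently,
   so both the endian-agnostic copy path and the little-endian refill path
   can be enabled there. */
#  define ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
#  define ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
#endif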
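
The long block comment about hold describes the bulk refill: load 8 input bytes once, OR them in above the bits already held, but only account for 6 of them, because the shift by bits (at most 14 at that point) can push up to 2 bytes off the top of the 64-bit accumulator. A minimal standalone sketch of that idea follows; the names bitreader, refill48 and mask_hold are illustrative rather than from the patch, and memcpy is used as a portable stand-in for the patch's direct uint64_t* load:

#include <stdint.h>
#include <string.h>

typedef struct {
    const unsigned char *in;  /* next input byte; caller guarantees >= 8 readable */
    uint64_t hold;            /* bit accumulator; excess high bits are tolerated */
    unsigned bits;            /* number of valid low bits in hold */
} bitreader;

static void refill48(bitreader *r) {
    if (r->bits < 15) {
        uint64_t chunk;
        memcpy(&chunk, r->in, sizeof chunk);  /* little-endian host assumed */
        r->hold |= chunk << r->bits;  /* OR, not ADD: re-loading the same bits is a no-op */
        r->in += 6;                   /* conservatively consume only 6 of the 8 bytes */
        r->bits += 48;
    }
}

/* At function exit, drop the excess high bits so the usual zlib invariant
   (hold >> bits) == 0 holds again (bits has already been reduced below 64). */
static uint64_t mask_hold(uint64_t hold, unsigned bits) {
    return hold & (((uint64_t)1 << bits) - 1);
}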
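
The two dist >= 8 copy loops in the later hunks follow the same pattern: store 8 bytes at a time and round len up to a multiple of 8, which may write up to 7 bytes past out + len. That is safe because the fast loop keeps at least INFLATE_FAST_MIN_LEFT (264) bytes of output headroom and later writes simply overwrite the extra bytes. A standalone sketch of that loop, with copy_chunks as an illustrative name and memcpy standing in for the patch's direct uint64_t* casts:

#include <stdint.h>
#include <string.h>

/* Requires dist >= 8 (from == out - dist), so no single 8-byte store reads
   bytes it has not yet written; overlapping LZ77 copies still work because
   each chunk is fully stored before the next chunk is read. */
static unsigned char *copy_chunks(unsigned char *out,
                                  const unsigned char *from,
                                  unsigned len) {
    for (;;) {
        uint64_t chunk;
        memcpy(&chunk, from, 8);
        memcpy(out, &chunk, 8);
        if (len <= 8) {
            return out + len;  /* up to 7 bytes past len were written: harmless overrun */
        }
        out += 8;
        from += 8;
        len -= 8;
    }
}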