@nigeltao
Created August 25, 2017 02:13
zlib: diff -u inffast.c contrib/inffast64/inffast64.c
--- inffast.c 2017-08-25 12:05:02.309161736 +1000
+++ contrib/inffast64/inffast64.c 2017-08-25 12:09:05.504252162 +1000
@@ -11,8 +11,11 @@
#ifdef ASMINF
# pragma message("Assembler code may have bugs -- use at your own risk")
#elif INFLATE_FAST64
-/* Skip this implementation; use contrib/inffast64/inffast64.c instead. */
-#else
+
+#if defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS) || \
+ defined(ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN)
+#include <stdint.h>
+#endif
/*
Decode literal, length, and distance codes and write out the resulting
@@ -45,10 +48,28 @@
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
+ - On some architectures, it can be significantly faster (e.g. up to 1.2x
+ faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+ time, so INFLATE_FAST_MIN_HAVE == 8. This requires a little endian
+ architecture.
+
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
+
+ - On some architectures, for length-distance copies, we similarly load and
+ store 8 bytes at a time, if the distance is at least 8. Again, this can
+ be significantly faster (e.g. up to 1.3x faster on x86_64). Rounding up
+ to a multiple of 8 gives INFLATE_FAST_MIN_LEFT == 264. This does not
+ require a little endian architecture. This always copies 8*L bytes (where
+ L is the smallest integer such that 8*L >= len, i.e. we round length up
+ to a multiple of 8), instead of only len bytes, but that's OK, as
+ subsequent iterations will fix the overrun.
+
+ - Combining those two optimizations for 64 bit unaligned loads gives up to
+ a 1.5x throughput improvement on x86_64.
+
*/
void ZLIB_INTERNAL inflate_fast(strm, start)
z_streamp strm;
@@ -67,7 +88,54 @@
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
- unsigned long hold; /* local strm->hold */
+
+ /* hold is a local copy of strm->hold. By default, hold satisfies the same
+ invariants that strm->hold does, namely that (hold >> bits) == 0. This
+ invariant is kept by loading bits into hold one byte at a time, like:
+
+ hold |= next_byte_of_input << bits; in++; bits += 8;
+
+ If we need to ensure that bits >= 15 then this code snippet is simply
+ repeated. Over one iteration of the outermost do/while loop, this
+ happens up to six times (48 bits of input), as described in the NOTES
+ above.
+
+ However, on some little endian architectures, it can be significantly
+ faster to load 64 bits once instead of 8 bits six times:
+
+ if (bits <= 16) {
+ hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+ }
+
+ Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+ by bits will overflow and lose those high bits, up to 2 bytes' worth.
+ The conservative estimate is therefore that we have read only 6 bytes
+ (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+ rest of the iteration, and we will not need to load another 8 bytes.
+
+ Inside this function, we no longer satisfy (hold >> bits) == 0, but
+ this is not problematic, even if that overflow does not land on an 8 bit
+ byte boundary. Those excess bits will eventually shift down lower as the
+ Huffman decoder consumes input, and when new input bits need to be loaded
+ into the bits variable, the same input bits will be or'ed over those
+ existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b).
+ Note that we therefore write that load operation as "hold |= etc" and not
+ "hold += etc".
+
+ Outside that loop, at the end of the function, hold is bitwise and'ed
+ with (1<<bits)-1 to drop those excess bits so that, on function exit, we
+ keep the invariant that (state->hold >> state->bits) == 0.
+
+ TODO: rename the bits variable to nbits, so that this block comment
+ is less confusing when discussing bits (the variable) and bits (one
+ eighth of a byte).
+ */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ uint64_t hold;
+#else
+ unsigned long hold;
+#endif
+
unsigned bits; /* local strm->bits */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
@@ -105,10 +173,41 @@
input data or output space */
do {
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ /* Example disassembly on __x86_64__:
+ 49 8b 06 mov (%r14),%rax
+ 89 f1 mov %esi,%ecx
+ 49 83 c6 06 add $0x6,%r14
+ 83 c6 30 add $0x30,%esi
+ 48 d3 e0 shl %cl,%rax
+ 48 09 c2 or %rax,%rdx
+ */
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
+ /* Example disassembly on __x86_64__:
+ 45 0f b6 45 00 movzbl 0x0(%r13),%r8d
+ 89 f1 mov %esi,%ecx
+ 49 83 c5 02 add $0x2,%r13
+ 49 d3 e0 shl %cl,%r8
+ 8d 4e 08 lea 0x8(%rsi),%ecx
+ 83 c6 10 add $0x10,%esi
+ 49 01 d0 add %rdx,%r8
+ 41 0f b6 55 ff movzbl -0x1(%r13),%edx
+ 48 d3 e2 shl %cl,%rdx
+ 4c 01 c2 add %r8,%rdx
+ */
+ /* TODO: replace "hold += etc" with "hold |= etc", here and below,
+ to be consistent with the 64 bit unaligned code path. This is
+ only a comment for now so that the commit that introduced this
+ comment has no effect whatsoever on architectures without
+ ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN. */
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = lcode[hold & lmask];
dolen:
@@ -127,8 +226,14 @@
op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
@@ -136,10 +241,16 @@
}
Tracevv((stderr, "inflate: length %u\n", len));
if (bits < 15) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
+#endif
}
here = dcode[hold & dmask];
dodist:
@@ -151,12 +262,18 @@
dist = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (bits < op) {
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
+ hold |= *((uint64_t *)(in)) << bits;
+ in += 6;
+ bits += 48;
+#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
+#endif
}
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
@@ -239,6 +356,22 @@
from = out - dist; /* rest from output */
}
}
+
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
while (len > 2) {
*out++ = *from++;
*out++ = *from++;
@@ -253,6 +386,21 @@
}
else {
from = out - dist; /* copy direct from output */
+#ifdef ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
+ if (dist >= 8) {
+ while (1) {
+ *((uint64_t*)(out)) = *((uint64_t*)(from));
+ if (len <= 8) {
+ out += len;
+ break;
+ }
+ out += 8;
+ from += 8;
+ len -= 8;
+ }
+ continue;
+ }
+#endif
do { /* minimum length is three */
*out++ = *from++;
*out++ = *from++;
@@ -326,4 +474,4 @@
- Moving len -= 3 statement into middle of loop
*/
-#endif /* !ASMINF && !INFLATE_FAST64 */
+#endif /* !ASMINF && INFLATE_FAST64 */
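
The patch gates its fast paths on two feature-test macros, ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS and ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN, but the diff does not show where they get defined. One plausible way to set them at compile time (an assumption for illustration, not taken from the gist):

#if defined(__x86_64__) || defined(_M_X64)
/* x86_64 is little endian and handles unaligned 64-bit loads efficiently,
   so both the endian-agnostic copy path and the little-endian refill path
   can be enabled there. */
#  define ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS
#  define ZLIB_INTERNAL_HAVE_64_BIT_UNALIGNED_LOADS_LITTLE_ENDIAN
#endif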
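
The long block comment about hold describes the bulk refill: load 8 input bytes once, OR them in above the bits already held, but only account for 6 of them, because the shift by bits (at most 14 at that point) can push up to 2 bytes off the top of the 64-bit accumulator. A minimal standalone sketch of that idea follows; the names bitreader, refill48 and mask_hold are illustrative rather than from the patch, and memcpy is used as a portable stand-in for the patch's direct uint64_t* load:

#include <stdint.h>
#include <string.h>

typedef struct {
    const unsigned char *in;  /* next input byte; caller guarantees >= 8 readable */
    uint64_t hold;            /* bit accumulator; excess high bits are tolerated */
    unsigned bits;            /* number of valid low bits in hold */
} bitreader;

static void refill48(bitreader *r) {
    if (r->bits < 15) {
        uint64_t chunk;
        memcpy(&chunk, r->in, sizeof chunk);  /* little-endian host assumed */
        r->hold |= chunk << r->bits;  /* OR, not ADD: re-loading the same bits is a no-op */
        r->in += 6;                   /* conservatively consume only 6 of the 8 bytes */
        r->bits += 48;
    }
}

/* At function exit, drop the excess high bits so the usual zlib invariant
   (hold >> bits) == 0 holds again (bits has already been reduced below 64). */
static uint64_t mask_hold(uint64_t hold, unsigned bits) {
    return hold & (((uint64_t)1 << bits) - 1);
}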
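
The two dist >= 8 copy loops in the later hunks follow the same pattern: store 8 bytes at a time and round len up to a multiple of 8, which may write up to 7 bytes past out + len. That is safe because the fast loop keeps at least INFLATE_FAST_MIN_LEFT (264) bytes of output headroom and later writes simply overwrite the extra bytes. A standalone sketch of that loop, with copy_chunks as an illustrative name and memcpy standing in for the patch's direct uint64_t* casts:

#include <stdint.h>
#include <string.h>

/* Requires dist >= 8 (from == out - dist), so no single 8-byte store reads
   bytes it has not yet written; overlapping LZ77 copies still work because
   each chunk is fully stored before the next chunk is read. */
static unsigned char *copy_chunks(unsigned char *out,
                                  const unsigned char *from,
                                  unsigned len) {
    for (;;) {
        uint64_t chunk;
        memcpy(&chunk, from, 8);
        memcpy(out, &chunk, 8);
        if (len <= 8) {
            return out + len;  /* up to 7 bytes past len were written: harmless overrun */
        }
        out += 8;
        from += 8;
        len -= 8;
    }
}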