@ilyakurdyukov
Last active June 18, 2023 17:04
Faster LZMA decoder for x86 CPUs (patch for XZ Utils).
From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <jpegqs@gmail.com>
Date: Tue, 14 Dec 2021 21:54:32 +0700
Subject: [PATCH] faster lzma_decoder for x86
Notice: uses inline assembly with the CMOV instruction.
Another change removes the comparison with in_size; it can give a few
percent speedup on architectures with a small number of registers.
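
Below is a minimal C sketch (not the literal patch code) of the branchless bit
decode that the CMP/CMOV/SBB inline assembly implements; the constants follow
liblzma's range_decoder.h and rc_normalize() is omitted for brevity:

#include <stdint.h>

/* Constants as in liblzma's range_decoder.h. */
#define RC_MOVE_BITS 5
#define RC_BIT_MODEL_TOTAL_BITS 11
#define RC_BIT_MODEL_TOTAL (1U << RC_BIT_MODEL_TOTAL_BITS)

typedef struct { uint32_t range, code; } rc_sketch;

/* Decode one bit without a data-dependent branch; returns 0 or 1. */
static inline uint32_t rc_bit_sketch(rc_sketch *rc, uint16_t *prob)
{
	uint32_t cache = *prob;
	uint32_t bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * cache;
	rc->range -= bound;
	/* mask0 == ~0u when the decoded bit is 0 (code < bound), else 0;
	 * this is what the SBB instruction produces in the patch. */
	uint32_t mask0 = 0U - (rc->code < bound);
	if (mask0)                     /* the select the patch turns into CMOV */
		rc->range = bound;
	cache += mask0 & ((1U << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL);
	*prob = (uint16_t)(*prob - (cache >> RC_MOVE_BITS)); /* one update, both cases */
	rc->code -= ~mask0 & bound;    /* subtract bound only when the bit is 1 */
	return mask0 + 1;
}

The inline assembly exists because compilers do not reliably turn the select
above into a conditional move, and a mispredicted branch on the bit value is
expensive since that value is close to random.
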
---
src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++-------------
src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++--
2 files changed, 113 insertions(+), 43 deletions(-)
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index e605a0a..ecb804b 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
= offset + match_bit
+ symbol;
- rc_bit(probs[subcoder_index],
- offset &= ~match_bit,
- offset &= match_bit,
+ rc_bit_matched(probs[subcoder_index],
SEQ_LITERAL_MATCHED);
// It seems to be faster to do this
@@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
case seq: \
match_bit = len & offset; \
subcoder_index = offset + match_bit + symbol; \
- rc_bit(probs[subcoder_index], \
- offset &= ~match_bit, \
- offset &= match_bit, \
- seq)
+ rc_bit_matched(probs[subcoder_index], seq)
d(SEQ_LITERAL_MATCHED0);
len <<= 1;
@@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
probs = coder->pos_special + rep0
- symbol - 1;
symbol = 1;
- offset = 0;
- case SEQ_DIST_MODEL:
+ offset = 1;
#ifdef HAVE_SMALL
+ limit = 1 << limit;
+ case SEQ_DIST_MODEL:
do {
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- } while (++offset < limit);
+ offset <<= 1;
+ } while (offset < limit);
#else
+ case SEQ_DIST_MODEL:
switch (limit) {
case 5:
assert(offset == 0);
- rc_bit(probs[symbol], ,
- rep0 += 1U,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 4:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 3:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 2:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 1:
// We need "symbol" only for
@@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
// rc_bit_last() here to omit
// the unneeded updating of
// "symbol".
- rc_bit_last(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist_last(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
}
#endif
@@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
rep0 <<= ALIGN_BITS;
symbol = 1;
#ifdef HAVE_SMALL
- offset = 0;
+ offset = 1;
case SEQ_ALIGN:
do {
- rc_bit(coder->pos_align[
- symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(coder->pos_align[
+ symbol],
+ offset,
SEQ_ALIGN);
- } while (++offset < ALIGN_BITS);
+ offset <<= 1;
+ } while (offset < (1 << ALIGN_BITS));
#else
case SEQ_ALIGN0:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 1, SEQ_ALIGN0);
+ rc_bit_dist(coder->pos_align[symbol],
+ 1, SEQ_ALIGN0);
case SEQ_ALIGN1:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 2, SEQ_ALIGN1);
+ rc_bit_dist(coder->pos_align[symbol],
+ 2, SEQ_ALIGN1);
case SEQ_ALIGN2:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 4, SEQ_ALIGN2);
+ rc_bit_dist(coder->pos_align[symbol],
+ 4, SEQ_ALIGN2);
case SEQ_ALIGN3:
// Like in SEQ_DIST_MODEL, we don't
// need "symbol" for anything else
// than indexing the probability array.
- rc_bit_last(coder->pos_align[symbol], ,
- rep0 += 8, SEQ_ALIGN3);
+ rc_bit_dist_last(coder->pos_align[
+ symbol],
+ 8, SEQ_ALIGN3);
#endif
if (rep0 == UINT32_MAX) {
diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
index e0b051f..cc9ac35 100644
--- a/src/liblzma/rangecoder/range_decoder.h
+++ b/src/liblzma/rangecoder/range_decoder.h
@@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
/// variables `in' and `in_size' to be defined.
#define rc_to_local(range_decoder, in_pos) \
lzma_range_decoder rc = range_decoder; \
- size_t rc_in_pos = (in_pos); \
+ size_t rc_in_pos = in_pos - in_size; \
+ const uint8_t *restrict rc_end = in + in_size; \
uint32_t rc_bound
@@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
#define rc_from_local(range_decoder, in_pos) \
do { \
range_decoder = rc; \
- in_pos = rc_in_pos; \
+ in_pos = rc_in_pos + in_size; \
} while (0)
@@ -87,12 +88,13 @@ do { \
#define rc_normalize(seq) \
do { \
if (rc.range < RC_TOP_VALUE) { \
- if (unlikely(rc_in_pos == in_size)) { \
+ if (unlikely(!rc_in_pos)) { \
coder->sequence = seq; \
goto out; \
} \
+ rc.code <<= RC_SHIFT_BITS; \
rc.range <<= RC_SHIFT_BITS; \
- rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+ rc.code |= rc_end[rc_in_pos++]; \
} \
} while (0)
@@ -151,11 +153,79 @@ do { \
/// Decodes one bit, updates "symbol", and runs action0 or action1 depending
/// on the decoded bit.
+#ifndef NO_BRANCH_OPT
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmpl %3, %2\n\t" \
+ "cmovbl %3, %1\n\t" \
+ "sbbl %0, %0" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "flags" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#elif defined(__GNUC__) && defined(__aarch64__)
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmp %w2, %w3\n\t" \
+ "csel %w1, %w3, %w1, lo\n\t" \
+ "csetm %w0, lo" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "cc" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#endif
+#endif
+#ifdef rc_bit_nobranch
+#define rc_bit(prob, action0, action1, seq) \
+ rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
+ action0, action1, seq)
+#define rc_bit_matched(prob, seq) \
+ rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
+ symbol = (symbol << 1) - tmp, , , seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit_nobranch(prob, , \
+ symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \
+ , , seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
+#else
#define rc_bit(prob, action0, action1, seq) \
rc_bit_last(prob, \
symbol <<= 1; action0, \
symbol = (symbol << 1) + 1; action1, \
seq);
+#define rc_bit_matched(prob, seq) \
+ rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit(prob, , rep0 += offset, seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_last(prob, , rep0 += offset, seq)
+#endif
/// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled
--
2.17.1
@amonakov

AMD 4500U (zen 2):
Before:

$ perf stat -r 5 ./src/xzdec/xzdec linux-5.15.tar.xz >/dev/null

 Performance counter stats for './src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

         11,001.91 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.06% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,247      page-faults:u             #    1.568 K/sec                    ( +-  0.02% )
    25,839,466,988      cycles:u                  #    2.349 GHz                      ( +-  0.06% )
       570,586,183      stalled-cycles-frontend:u #    2.21% frontend cycles idle     ( +-  0.18% )
     4,603,924,078      stalled-cycles-backend:u  #   17.82% backend cycles idle      ( +-  0.28% )
    33,990,630,407      instructions:u            #    1.32  insn per cycle
                                                  #    0.14  stalled cycles per insn  ( +-  0.00% )
     4,392,930,509      branches:u                #  399.288 M/sec                    ( +-  0.00% )
       448,633,646      branch-misses:u           #   10.21% of all branches          ( +-  0.02% )

          11.00174 +- 0.00639 seconds time elapsed  ( +-  0.06% )

After:

$ perf stat -r 5 ./src/xzdec/xzdec linux-5.15.tar.xz >/dev/null

 Performance counter stats for './src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

         10,886.97 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.06% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,250      page-faults:u             #    1.584 K/sec                    ( +-  0.01% )
    25,572,364,776      cycles:u                  #    2.349 GHz                      ( +-  0.06% )
       341,349,770      stalled-cycles-frontend:u #    1.33% frontend cycles idle     ( +-  0.08% )
     9,514,632,432      stalled-cycles-backend:u  #   37.21% backend cycles idle      ( +-  0.16% )
    36,278,706,843      instructions:u            #    1.42  insn per cycle
                                                  #    0.26  stalled cycles per insn  ( +-  0.00% )
     3,759,133,527      branches:u                #  345.287 M/sec                    ( +-  0.00% )
       310,622,266      branch-misses:u           #    8.26% of all branches          ( +-  0.02% )

          10.88696 +- 0.00634 seconds time elapsed  ( +-  0.06% )

@ilyakurdyukov
Author

For this archive, yes, the difference is subtle.

Maybe it's because this hack does not optimize the rc_bit() macro used in SEQ_LITERAL_MATCHED, which can be heavily used for text data (I guess this archive contains mostly text). But I can try to fix that.
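
For context, this is roughly what that matched-literal loop looks like (a rough sketch simplified from lzma_decoder.c; dict_get(), probs[] and rc_bit() as in liblzma). The extra updates of "offset" in both rc_bit() actions are what make this path need its own treatment:

/* Rough sketch of the matched-literal decode loop, not the exact source. */
uint32_t match_byte = dict_get(&dict, rep0); /* byte at the last match distance */
uint32_t offset = 0x100;
uint32_t symbol = 1;
do {
	match_byte <<= 1;
	const uint32_t match_bit = match_byte & offset;
	const uint32_t subcoder_index = offset + match_bit + symbol;
	rc_bit(probs[subcoder_index],
			offset &= ~match_bit,  /* extra action for a decoded 0 */
			offset &= match_bit,   /* extra action for a decoded 1 */
			SEQ_LITERAL_MATCHED);
} while (symbol < 0x100);

Once a decoded bit disagrees with the match byte, "offset" drops to zero and the decoder falls back to the plain literal subcoders.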

On my Skylake:

perf stat -r 5 build/src/xz/.libs/xz -c -d linux-5.15.7.tar.xz > /dev/null

Before:

      7 493,13 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,16% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 469      page-faults:u             #    0,002 M/sec                    ( +-  0,00% )
27 964 283 939      cycles:u                  #    3,732 GHz                      ( +-  0,15% )
33 129 609 542      instructions:u            #    1,18  insn per cycle           ( +-  0,00% )
 4 221 678 998      branches:u                #  563,407 M/sec                    ( +-  0,00% )
   490 799 588      branch-misses:u           #   11,63% of all branches          ( +-  0,02% )

        7,4940 +- 0,0121 seconds time elapsed  ( +-  0,16% )

After:

     7 221,74 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,29% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 469      page-faults:u             #    0,002 M/sec                    ( +-  0,00% )
26 928 558 378      cycles:u                  #    3,729 GHz                      ( +-  0,26% )
35 371 885 714      instructions:u            #    1,31  insn per cycle           ( +-  0,00% )
 3 572 561 046      branches:u                #  494,695 M/sec                    ( +-  0,00% )
   331 262 037      branch-misses:u           #    9,27% of all branches          ( +-  0,03% )

        7,2223 +- 0,0206 seconds time elapsed  ( +-  0,29% )

@ilyakurdyukov
Author

When I compress Firefox from my system (tar -cf lib.tar /usr/lib/firefox, which totals 226549760 bytes) using -7e --format=lzma, the difference is bigger (+11%):

Before:

      3 285,02 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,09% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
         4 181      page-faults:u             #    0,001 M/sec                    ( +-  0,01% )
12 341 894 334      cycles:u                  #    3,757 GHz                      ( +-  0,09% )
13 928 815 327      instructions:u            #    1,13  insn per cycle           ( +-  0,00% )
 1 781 400 452      branches:u                #  542,281 M/sec                    ( +-  0,00% )
   236 255 408      branch-misses:u           #   13,26% of all branches          ( +-  0,02% )

       3,28599 +- 0,00279 seconds time elapsed  ( +-  0,08% )

After:

      2 961,76 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,03% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
         4 181      page-faults:u             #    0,001 M/sec                    ( +-  0,01% )
11 116 968 963      cycles:u                  #    3,754 GHz                      ( +-  0,03% )
15 027 237 347      instructions:u            #    1,35  insn per cycle           ( +-  0,00% )
 1 474 337 222      branches:u                #  497,791 M/sec                    ( +-  0,00% )
   151 726 579      branch-misses:u           #   10,29% of all branches          ( +-  0,03% )

       2,96238 +- 0,00109 seconds time elapsed  ( +-  0,04% )

But anyway, I need to fix SEQ_LITERAL_MATCHED, which makes the patch bigger but should make the optimization's impact more stable.

@ilyakurdyukov
Author

I have updated the patch, which now includes SEQ_LITERAL_MATCHED, but the impact on Linux kernel sources hasn't changed.

However, the decompression speedup for binary data has increased: 35% -> 39% for the Python3 build directory and 11% -> 15% for Firefox binaries and data.

@ilyakurdyukov
Author

I made an update that speeds up decompression by 10% for these Linux kernel sources.

@cinquemb

I wonder how it compares with this: http://mattmahoney.net/dc/text.html

@haarp

haarp commented Dec 13, 2021

Please try to get this patch upstream. There's a lot of energy that could be saved world-wide with more efficient (de)compression :)

@amonakov

New 4500U (Zen 2) stats:

 Performance counter stats for './src/xzdec/xzdec linux-5.15.tar.xz' (3 runs):

         10,150.53 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.29% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,255      page-faults:u             #    1.700 K/sec                    ( +-  0.01% )
    23,828,253,375      cycles:u                  #    2.347 GHz                      ( +-  0.29% )
       236,171,644      stalled-cycles-frontend:u #    0.99% frontend cycles idle     ( +-  0.45% )
    10,985,580,685      stalled-cycles-backend:u  #   46.10% backend cycles idle      ( +-  0.65% )
    37,691,902,062      instructions:u            #    1.58  insn per cycle
                                                  #    0.29  stalled cycles per insn  ( +-  0.00% )
     3,505,241,976      branches:u                #  345.326 M/sec                    ( +-  0.00% )
       207,417,196      branch-misses:u           #    5.92% of all branches          ( +-  0.01% )

           10.1500 +- 0.0299 seconds time elapsed  ( +-  0.29% )

@amonakov

On Intel i5-2500 (Sandybridge):

Before:

 Performance counter stats for 'src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

          8,954.71 msec task-clock:u              #    0.998 CPUs utilized            ( +-  0.04% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
             1,368      page-faults:u             #  152.724 /sec                     ( +-  0.26% )
    29,347,257,196      cycles:u                  #    3.277 GHz                      ( +-  0.01% )
    12,337,648,629      stalled-cycles-frontend:u #   42.04% frontend cycles idle     ( +-  0.05% )
     9,029,544,206      stalled-cycles-backend:u  #   30.77% backend cycles idle      ( +-  0.05% )
    33,590,809,450      instructions:u            #    1.14  insn per cycle
                                                  #    0.37  stalled cycles per insn  ( +-  0.00% )
     4,390,208,316      branches:u                #  490.268 M/sec                    ( +-  0.00% )
       477,470,512      branch-misses:u           #   10.88% of all branches          ( +-  0.02% )

           8.97213 +- 0.00262 seconds time elapsed  ( +-  0.03% )

After:

 Performance counter stats for 'src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

          8,519.22 msec task-clock:u              #    0.994 CPUs utilized            ( +-  0.44% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
             1,358      page-faults:u             #  159.381 /sec                     ( +-  0.11% )
    28,032,300,841      cycles:u                  #    3.290 GHz                      ( +-  0.04% )
    13,667,678,436      stalled-cycles-frontend:u #   48.76% frontend cycles idle     ( +-  0.08% )
     8,181,854,786      stalled-cycles-backend:u  #   29.19% backend cycles idle      ( +-  0.10% )
    37,315,478,332      instructions:u            #    1.33  insn per cycle
                                                  #    0.37  stalled cycles per insn  ( +-  0.00% )
     3,502,520,091      branches:u                #  411.132 M/sec                    ( +-  0.00% )
       224,562,347      branch-misses:u           #    6.41% of all branches          ( +-  0.08% )

           8.57411 +- 0.00504 seconds time elapsed  ( +-  0.06% )

@ilyakurdyukov
Author

It looks like the results I got on a home computer with a Skylake processor are the best.

Intel Atom: 2-3% speedup
Sandybridge: 5% speedup
Zen 2: 8% speedup

At least there are no negative results yet.
But this is for decompressing text data; there is usually more speedup for binary data.

@amonakov

I got closer to 9% on Ivybridge (the successor of the Sandybridge uarch) on the linux-5.15.tar.xz archive; I don't have perf on that machine at the moment, unfortunately.

On Atom I guess the misprediction penalty is a bit smaller, and the significant increase in instruction count is eating some of the benefit of the reduced mispredictions?

I don't mind running additional tests on binary data; we just need to select some common archive to benchmark on (something that anyone could retrieve). How about the 175MiB linux-firmware archive at https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz ?

@ilyakurdyukov
Author

@amonakov, yes, it is a good archive for testing binary-data decompression performance.

export LD_LIBRARY_PATH=build/src/liblzma/.libs
perf stat -r 5 build/src/xz/.libs/xz -c -d linux-firmware-20211027.tar.xz > /dev/null

So, on Skylake before:

      8 173,04 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,03% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 467      page-faults:u             #    0,002 M/sec                    ( +-  0,00% )
30 598 206 585      cycles:u                  #    3,744 GHz                      ( +-  0,02% )
34 721 982 623      instructions:u            #    1,13  insn per cycle           ( +-  0,00% )
 4 194 995 085      branches:u                #  513,272 M/sec                    ( +-  0,00% )
   556 648 934      branch-misses:u           #   13,27% of all branches          ( +-  0,00% )

       8,17348 +- 0,00210 seconds time elapsed  ( +-  0,03% )

And after:

      6 347,72 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,02% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 468      page-faults:u             #    0,003 M/sec                    ( +-  0,00% )
23 685 264 720      cycles:u                  #    3,731 GHz                      ( +-  0,02% )
39 467 984 353      instructions:u            #    1,67  insn per cycle           ( +-  0,00% )
 2 941 144 012      branches:u                #  463,338 M/sec                    ( +-  0,00% )
   165 593 141      branch-misses:u           #    5,63% of all branches          ( +-  0,09% )

      6,348644 +- 0,000952 seconds time elapsed  ( +-  0,01% )

That is a speedup of almost 29%.

@ilyakurdyukov
Author

Intel Celeron 1007U (Ivy Bridge)

linux-5.15.7.tar.xz : 19.33 --> 18.07 (+7%)
linux-firmware-20211027.tar.xz : 20.57 --> 17.76 (+16%)

@amonakov

Ryzen 4500U (Zen 2)

Before:

 Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):

         11,907.43 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.03% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,240      page-faults:u             #    1.448 K/sec                    ( +-  0.02% )
    27,992,949,586      cycles:u                  #    2.351 GHz                      ( +-  0.04% )
       583,797,051      stalled-cycles-frontend:u #    2.09% frontend cycles idle     ( +-  0.06% )
     3,412,760,275      stalled-cycles-backend:u  #   12.19% backend cycles idle      ( +-  0.36% )
    35,183,073,831      instructions:u            #    1.26  insn per cycle
                                                  #    0.10  stalled cycles per insn  ( +-  0.00% )
     4,182,899,643      branches:u                #  351.285 M/sec                    ( +-  0.00% )
       531,756,997      branch-misses:u           #   12.71% of all branches          ( +-  0.01% )

          11.90746 +- 0.00389 seconds time elapsed  ( +-  0.03% )

After:

 Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):

          9,797.98 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.11% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,247      page-faults:u             #    1.760 K/sec                    ( +-  0.02% )
    23,012,154,561      cycles:u                  #    2.349 GHz                      ( +-  0.02% )
       173,975,497      stalled-cycles-frontend:u #    0.76% frontend cycles idle     ( +-  0.09% )
    12,818,655,196      stalled-cycles-backend:u  #   55.70% backend cycles idle      ( +-  0.02% )
    39,198,156,956      instructions:u            #    1.70  insn per cycle
                                                  #    0.33  stalled cycles per insn  ( +-  0.00% )
     2,970,714,992      branches:u                #  303.197 M/sec                    ( +-  0.00% )
       156,064,253      branch-misses:u           #    5.25% of all branches          ( +-  0.01% )

            9.7976 +- 0.0107 seconds time elapsed  ( +-  0.11% )

@ilyakurdyukov
Author

Did a minor update to cover the rc_bit() macro, which is used when XZ is configured with the --enable-small option.

@ilyakurdyukov
Author

... and I did the same for AArch64, using the CSEL instruction. (patch updated)

But I only have an Allwinner H616 (Cortex-A53) for testing:

linux-5.15.7.tar.xz : 25.12 --> 23.85 (+5%)
linux-firmware-20211027.tar.xz : 22.80 --> 21.10 (+8%)

@astei

astei commented Dec 15, 2021

AArch64, AWS Graviton2 (c6g.xlarge size):

Before:

 Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):

           8751.09 msec task-clock                #    1.000 CPUs utilized            ( +-  0.02% )
                44      context-switches          #    0.005 K/sec                    ( +-  5.47% )
                 0      cpu-migrations            #    0.000 K/sec                    ( +- 61.24% )
             18112      page-faults               #    0.002 M/sec                    ( +-  0.05% )
       21832211556      cycles                    #    2.495 GHz                      ( +-  0.02% )
       30273778750      instructions              #    1.39  insn per cycle           ( +-  0.01% )
   <not supported>      branches                                                    
         535339600      branch-misses                                                 ( +-  0.01% )

           8.75277 +- 0.00198 seconds time elapsed  ( +-  0.02% )

After:

 Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):

           8111.05 msec task-clock                #    1.000 CPUs utilized            ( +-  0.01% )
                48      context-switches          #    0.006 K/sec                    ( +-  2.54% )
                 1      cpu-migrations            #    0.000 K/sec                    ( +- 31.62% )
             18088      page-faults               #    0.002 M/sec                    ( +-  0.01% )
       20232553634      cycles                    #    2.494 GHz                      ( +-  0.01% )
       34157801215      instructions              #    1.69  insn per cycle           ( +-  0.00% )
   <not supported>      branches                                                    
         159725908      branch-misses                                                 ( +-  0.02% )

          8.112706 +- 0.000784 seconds time elapsed  ( +-  0.01% )

~7.9% speedup.

@ilyakurdyukov
Author

Results from ALT Linux team:

Comet Lake:

linux-5.15.8.tar.xz : 5.81 --> 5.30 (+9.6%)
linux-firmware-20211027.tar.xz : 6.14 --> 5.31 (+15.6%)

Intel Xeon (Ivy Bridge EP):

linux-5.15.8.tar.xz : 7.987+-0.363 --> 7.128+-0.288 ( 10.75+-5.80 % )
linux-firmware-20211027.tar.xz : 8.552+-0.354 --> 7.059+-0.310 ( 17.46+-5.51 % )

@ilyakurdyukov
Author

I wrote scripts to make testing easier:

# downloads test files and builds XZ
./build.sh
# tests decompression speed
./bench.sh ref
./bench.sh new

build.sh

#!/bin/bash
set -e

# patch
[ -f faster_lzma_decoder_x86.patch ] || wget https://gist.githubusercontent.com/ilyakurdyukov/f514418f3affd677e1ac408ec0c09188/raw/faster_lzma_decoder_x86.patch
# test data
[ -f linux-5.15.7.tar.xz ] || wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.7.tar.xz
[ -f linux-firmware-20211027.tar.xz ] || wget https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz

[ -d xz ] || git clone http://git.tukaani.org/xz.git
mkdir -p ref
mkdir -p new

# build ref

cp -r xz build
cd build
# or cmake as an option
./autogen.sh --no-po4a
./configure
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../ref/

# build new

patch -p1 < ../faster_lzma_decoder_x86.patch
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../new/
cd ..

bench.sh

#!/bin/bash

dir=${1:-"ref"}
export LD_LIBRARY_PATH=$(pwd)/$dir
# to make sure the correct liblzma is loaded 
ldd $dir/xz | grep liblzma

# or "perf stat -r 5" instead of "for" and "time -p"

for i in 1 2 3; do
	echo "# ($dir:$i) linux-5.15.7"
	time -p $dir/xz -c -d linux-5.15.7.tar.xz > /dev/null
	echo "# ($dir:$i) linux-firmware-20211027"
	time -p $dir/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
done

@ilyakurdyukov
Author

Kunpeng-920 (AArch64)

linux-5.15.7.tar.xz : 8.82 --> 8.78 (+0.4%)
linux-firmware-20211027.tar.xz : 8.97 --> 8.44 (+6.2%)

@ilyakurdyukov
Author

I got a report that this patch slows down decoding on the Apple M1, but I am unable to investigate this on my own.

Well, at least it will be easy to exclude it on the Apple M1 by adding some #ifndef.
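
For example, a hypothetical (untested) guard built on the NO_BRANCH_OPT switch that the patch already checks could look like this:

/* Hypothetical, untested: disable the branchless path on Apple AArch64
 * (e.g. the M1) by defining the guard the patch checks with #ifndef. */
#if defined(__APPLE__) && defined(__aarch64__)
#define NO_BRANCH_OPT
#endif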
