Skip to content

Instantly share code, notes, and snippets.

@ilyakurdyukov
Last active June 18, 2023 17:04
Show Gist options
  • Save ilyakurdyukov/f514418f3affd677e1ac408ec0c09188 to your computer and use it in GitHub Desktop.
Faster LZMA decoder for x86 CPUs (patch for XZ Utils).
From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <jpegqs@gmail.com>
Date: Tue, 14 Dec 2021 21:54:32 +0700
Subject: [PATCH] faster lzma_decoder for x86
Notice: Uses inline assembly with CMOV instruction.
Another change that removes the comparison with in_size can give a few
percent speedup for architectures with a small number of registers.
---
src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++-------------
src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++--
2 files changed, 113 insertions(+), 43 deletions(-)
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index e605a0a..ecb804b 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
= offset + match_bit
+ symbol;
- rc_bit(probs[subcoder_index],
- offset &= ~match_bit,
- offset &= match_bit,
+ rc_bit_matched(probs[subcoder_index],
SEQ_LITERAL_MATCHED);
// It seems to be faster to do this
@@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
case seq: \
match_bit = len & offset; \
subcoder_index = offset + match_bit + symbol; \
- rc_bit(probs[subcoder_index], \
- offset &= ~match_bit, \
- offset &= match_bit, \
- seq)
+ rc_bit_matched(probs[subcoder_index], seq)
d(SEQ_LITERAL_MATCHED0);
len <<= 1;
@@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
probs = coder->pos_special + rep0
- symbol - 1;
symbol = 1;
- offset = 0;
- case SEQ_DIST_MODEL:
+ offset = 1;
#ifdef HAVE_SMALL
+ limit = 1 << limit;
+ case SEQ_DIST_MODEL:
do {
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- } while (++offset < limit);
+ offset <<= 1;
+ } while (offset < limit);
#else
+ case SEQ_DIST_MODEL:
switch (limit) {
case 5:
assert(offset == 0);
- rc_bit(probs[symbol], ,
- rep0 += 1U,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 4:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 3:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 2:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 1:
// We need "symbol" only for
@@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
// rc_bit_last() here to omit
// the unneeded updating of
// "symbol".
- rc_bit_last(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist_last(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
}
#endif
@@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
rep0 <<= ALIGN_BITS;
symbol = 1;
#ifdef HAVE_SMALL
- offset = 0;
+ offset = 1;
case SEQ_ALIGN:
do {
- rc_bit(coder->pos_align[
- symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(coder->pos_align[
+ symbol],
+ offset,
SEQ_ALIGN);
- } while (++offset < ALIGN_BITS);
+ offset <<= 1;
+ } while (offset < (1 << ALIGN_BITS));
#else
case SEQ_ALIGN0:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 1, SEQ_ALIGN0);
+ rc_bit_dist(coder->pos_align[symbol],
+ 1, SEQ_ALIGN0);
case SEQ_ALIGN1:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 2, SEQ_ALIGN1);
+ rc_bit_dist(coder->pos_align[symbol],
+ 2, SEQ_ALIGN1);
case SEQ_ALIGN2:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 4, SEQ_ALIGN2);
+ rc_bit_dist(coder->pos_align[symbol],
+ 4, SEQ_ALIGN2);
case SEQ_ALIGN3:
// Like in SEQ_DIST_MODEL, we don't
// need "symbol" for anything else
// than indexing the probability array.
- rc_bit_last(coder->pos_align[symbol], ,
- rep0 += 8, SEQ_ALIGN3);
+ rc_bit_dist_last(coder->pos_align[
+ symbol],
+ 8, SEQ_ALIGN3);
#endif
if (rep0 == UINT32_MAX) {
diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
index e0b051f..cc9ac35 100644
--- a/src/liblzma/rangecoder/range_decoder.h
+++ b/src/liblzma/rangecoder/range_decoder.h
@@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
/// variables `in' and `in_size' to be defined.
#define rc_to_local(range_decoder, in_pos) \
lzma_range_decoder rc = range_decoder; \
- size_t rc_in_pos = (in_pos); \
+ size_t rc_in_pos = in_pos - in_size; \
+ const uint8_t *restrict rc_end = in + in_size; \
uint32_t rc_bound
@@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
#define rc_from_local(range_decoder, in_pos) \
do { \
range_decoder = rc; \
- in_pos = rc_in_pos; \
+ in_pos = rc_in_pos + in_size; \
} while (0)
@@ -87,12 +88,13 @@ do { \
#define rc_normalize(seq) \
do { \
if (rc.range < RC_TOP_VALUE) { \
- if (unlikely(rc_in_pos == in_size)) { \
+ if (unlikely(!rc_in_pos)) { \
coder->sequence = seq; \
goto out; \
} \
+ rc.code <<= RC_SHIFT_BITS; \
rc.range <<= RC_SHIFT_BITS; \
- rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+ rc.code |= rc_end[rc_in_pos++]; \
} \
} while (0)
@@ -151,11 +153,79 @@ do { \
/// Decodes one bit, updates "symbol", and runs action0 or action1 depending
/// on the decoded bit.
+#ifndef NO_BRANCH_OPT
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmpl %3, %2\n\t" \
+ "cmovbl %3, %1\n\t" \
+ "sbbl %0, %0" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "flags" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#elif defined(__GNUC__) && defined(__aarch64__)
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmp %w2, %w3\n\t" \
+ "csel %w1, %w3, %w1, lo\n\t" \
+ "csetm %w0, lo" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "cc" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#endif
+#endif
+#ifdef rc_bit_nobranch
+#define rc_bit(prob, action0, action1, seq) \
+ rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
+ action0, action1, seq)
+#define rc_bit_matched(prob, seq) \
+ rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
+ symbol = (symbol << 1) - tmp, , , seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit_nobranch(prob, , \
+ symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \
+ , , seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
+#else
#define rc_bit(prob, action0, action1, seq) \
rc_bit_last(prob, \
symbol <<= 1; action0, \
symbol = (symbol << 1) + 1; action1, \
seq);
+#define rc_bit_matched(prob, seq) \
+ rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit(prob, , rep0 += offset, seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_last(prob, , rep0 += offset, seq)
+#endif
/// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled
--
2.17.1
@ilyakurdyukov
Copy link
Author

I wrote scripts to make testing easier:

# downloads test files and builds XZ
./build.sh
# tests decompression speed
./bench.sh ref
./bench.sh new

build.sh

#!/bin/bash
# Downloads the patch and the test data, then builds a reference ("ref")
# and a patched ("new") copy of XZ Utils for benchmarking with bench.sh.
set -e

# patch
[ -f faster_lzma_decoder_x86.patch ] || wget https://gist.githubusercontent.com/ilyakurdyukov/f514418f3affd677e1ac408ec0c09188/raw/faster_lzma_decoder_x86.patch
# test data
[ -f linux-5.15.7.tar.xz ] || wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.7.tar.xz
[ -f linux-firmware-20211027.tar.xz ] || wget https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz

# Clone over HTTPS; the plain-http URL is insecure (no transport integrity).
[ -d xz ] || git clone https://git.tukaani.org/xz.git
mkdir -p ref
mkdir -p new

# build ref

# Start from a clean tree: re-running with a stale "build" directory would
# otherwise nest the copy (cp -r xz build -> build/xz) and then apply the
# patch on top of an already-patched tree.
rm -rf build
cp -r xz build
cd build
# or cmake as an option
./autogen.sh --no-po4a
./configure
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../ref/

# build new

patch -p1 < ../faster_lzma_decoder_x86.patch
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../new/
cd ..

bench.sh

#!/bin/bash
# Measures xz decompression speed for one of the builds produced by build.sh.
# Usage: ./bench.sh [ref|new]   (default: ref)

dir=${1:-"ref"}
# Quote expansions so a working directory containing spaces does not break
# the library path or the binary invocations below.
export LD_LIBRARY_PATH="$(pwd)/$dir"
# to make sure the correct liblzma is loaded
ldd "$dir/xz" | grep liblzma

# or "perf stat -r 5" instead of "for" and "time -p"

for i in 1 2 3; do
	echo "# ($dir:$i) linux-5.15.7"
	time -p "$dir/xz" -c -d linux-5.15.7.tar.xz > /dev/null
	echo "# ($dir:$i) linux-firmware-20211027"
	time -p "$dir/xz" -c -d linux-firmware-20211027.tar.xz > /dev/null
done

@ilyakurdyukov
Copy link
Author

Kunpeng-920 (AArch64)

linux-5.15.7.tar.xz : 8.82 --> 8.78 (+0.4%)
linux-firmware-20211027.tar.xz : 8.97 --> 8.44 (+6.2%)

@ilyakurdyukov
Copy link
Author

I got a report that this patch slows down decoding on the Apple M1, but I am unable to investigate this on my own.

Well, at least it will be easy to exclude it for Apple M1 by adding some #ifndef.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment