@ilyakurdyukov
Last active June 18, 2023 17:04
Faster LZMA decoder for x86 CPUs (patch for XZ Utils).
From 387fd25f57f41009fc317f7922e957de9f370ff2 Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <jpegqs@gmail.com>
Date: Tue, 14 Dec 2021 21:54:32 +0700
Subject: [PATCH] faster lzma_decoder for x86
Notice: uses inline assembly with the CMOV instruction.
Another change removes the comparison with in_size; it can give a few
percent speedup on architectures with a small number of registers.
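
Below is a minimal C sketch (not the literal patch code) of the branchless bit
decode that the CMP/CMOV/SBB inline assembly implements; the constants follow
liblzma's range_decoder.h and rc_normalize() is omitted for brevity:

#include <stdint.h>

/* Constants as in liblzma's range_decoder.h. */
#define RC_MOVE_BITS 5
#define RC_BIT_MODEL_TOTAL_BITS 11
#define RC_BIT_MODEL_TOTAL (1U << RC_BIT_MODEL_TOTAL_BITS)

typedef struct { uint32_t range, code; } rc_sketch;

/* Decode one bit without a data-dependent branch; returns 0 or 1. */
static inline uint32_t rc_bit_sketch(rc_sketch *rc, uint16_t *prob)
{
	uint32_t cache = *prob;
	uint32_t bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * cache;
	rc->range -= bound;
	/* mask0 == ~0u when the decoded bit is 0 (code < bound), else 0;
	 * this is what the SBB instruction produces in the patch. */
	uint32_t mask0 = 0U - (rc->code < bound);
	if (mask0)                     /* the select the patch turns into CMOV */
		rc->range = bound;
	cache += mask0 & ((1U << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL);
	*prob = (uint16_t)(*prob - (cache >> RC_MOVE_BITS)); /* one update, both cases */
	rc->code -= ~mask0 & bound;    /* subtract bound only when the bit is 1 */
	return mask0 + 1;
}

The inline assembly exists because compilers do not reliably turn the select
above into a conditional move, and a mispredicted branch on the bit value is
expensive since that value is close to random.
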
---
src/liblzma/lzma/lzma_decoder.c | 78 +++++++++++++-------------
src/liblzma/rangecoder/range_decoder.h | 78 ++++++++++++++++++++++++--
2 files changed, 113 insertions(+), 43 deletions(-)
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index e605a0a..ecb804b 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -415,9 +415,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
= offset + match_bit
+ symbol;
- rc_bit(probs[subcoder_index],
- offset &= ~match_bit,
- offset &= match_bit,
+ rc_bit_matched(probs[subcoder_index],
SEQ_LITERAL_MATCHED);
// It seems to be faster to do this
@@ -437,10 +435,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
case seq: \
match_bit = len & offset; \
subcoder_index = offset + match_bit + symbol; \
- rc_bit(probs[subcoder_index], \
- offset &= ~match_bit, \
- offset &= match_bit, \
- seq)
+ rc_bit_matched(probs[subcoder_index], seq)
d(SEQ_LITERAL_MATCHED0);
len <<= 1;
@@ -564,40 +559,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
probs = coder->pos_special + rep0
- symbol - 1;
symbol = 1;
- offset = 0;
- case SEQ_DIST_MODEL:
+ offset = 1;
#ifdef HAVE_SMALL
+ limit = 1 << limit;
+ case SEQ_DIST_MODEL:
do {
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- } while (++offset < limit);
+ offset <<= 1;
+ } while (offset < limit);
#else
+ case SEQ_DIST_MODEL:
switch (limit) {
case 5:
assert(offset == 0);
- rc_bit(probs[symbol], ,
- rep0 += 1U,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 4:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 3:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 2:
- rc_bit(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
- ++offset;
+ offset <<= 1;
--limit;
case 1:
// We need "symbol" only for
@@ -606,8 +604,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
// rc_bit_last() here to omit
// the unneeded updating of
// "symbol".
- rc_bit_last(probs[symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist_last(probs[symbol],
+ offset,
SEQ_DIST_MODEL);
}
#endif
@@ -630,30 +628,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
rep0 <<= ALIGN_BITS;
symbol = 1;
#ifdef HAVE_SMALL
- offset = 0;
+ offset = 1;
case SEQ_ALIGN:
do {
- rc_bit(coder->pos_align[
- symbol], ,
- rep0 += 1U << offset,
+ rc_bit_dist(coder->pos_align[
+ symbol],
+ offset,
SEQ_ALIGN);
- } while (++offset < ALIGN_BITS);
+ offset <<= 1;
+ } while (offset < (1 << ALIGN_BITS));
#else
case SEQ_ALIGN0:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 1, SEQ_ALIGN0);
+ rc_bit_dist(coder->pos_align[symbol],
+ 1, SEQ_ALIGN0);
case SEQ_ALIGN1:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 2, SEQ_ALIGN1);
+ rc_bit_dist(coder->pos_align[symbol],
+ 2, SEQ_ALIGN1);
case SEQ_ALIGN2:
- rc_bit(coder->pos_align[symbol], ,
- rep0 += 4, SEQ_ALIGN2);
+ rc_bit_dist(coder->pos_align[symbol],
+ 4, SEQ_ALIGN2);
case SEQ_ALIGN3:
// Like in SEQ_DIST_MODEL, we don't
// need "symbol" for anything else
// than indexing the probability array.
- rc_bit_last(coder->pos_align[symbol], ,
- rep0 += 8, SEQ_ALIGN3);
+ rc_bit_dist_last(coder->pos_align[
+ symbol],
+ 8, SEQ_ALIGN3);
#endif
if (rep0 == UINT32_MAX) {
diff --git a/src/liblzma/rangecoder/range_decoder.h b/src/liblzma/rangecoder/range_decoder.h
index e0b051f..cc9ac35 100644
--- a/src/liblzma/rangecoder/range_decoder.h
+++ b/src/liblzma/rangecoder/range_decoder.h
@@ -53,7 +53,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
/// variables `in' and `in_size' to be defined.
#define rc_to_local(range_decoder, in_pos) \
lzma_range_decoder rc = range_decoder; \
- size_t rc_in_pos = (in_pos); \
+ size_t rc_in_pos = in_pos - in_size; \
+ const uint8_t *restrict rc_end = in + in_size; \
uint32_t rc_bound
@@ -61,7 +62,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
#define rc_from_local(range_decoder, in_pos) \
do { \
range_decoder = rc; \
- in_pos = rc_in_pos; \
+ in_pos = rc_in_pos + in_size; \
} while (0)
@@ -87,12 +88,13 @@ do { \
#define rc_normalize(seq) \
do { \
if (rc.range < RC_TOP_VALUE) { \
- if (unlikely(rc_in_pos == in_size)) { \
+ if (unlikely(!rc_in_pos)) { \
coder->sequence = seq; \
goto out; \
} \
+ rc.code <<= RC_SHIFT_BITS; \
rc.range <<= RC_SHIFT_BITS; \
- rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+ rc.code |= rc_end[rc_in_pos++]; \
} \
} while (0)
@@ -151,11 +153,79 @@ do { \
/// Decodes one bit, updates "symbol", and runs action0 or action1 depending
/// on the decoded bit.
+#ifndef NO_BRANCH_OPT
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmpl %3, %2\n\t" \
+ "cmovbl %3, %1\n\t" \
+ "sbbl %0, %0" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "flags" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#elif defined(__GNUC__) && defined(__aarch64__)
+#define rc_bit_nobranch(prob, pre0, pre1, action0, action1, seq) \
+do { \
+ rc_normalize(seq); \
+ uint32_t cache = (prob), tmp; \
+ rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * cache; \
+ rc.range -= rc_bound; \
+ /* tmp = rc.code < rc_bound ? rc.range = rc_bound, ~0 : 0; */ \
+ __asm__ ( \
+ "cmp %w2, %w3\n\t" \
+ "csel %w1, %w3, %w1, lo\n\t" \
+ "csetm %w0, lo" \
+ : "=&r"(tmp), "+&r"(rc.range) \
+ : "r"(rc.code), "r"(rc_bound) \
+ : "cc" \
+ ); \
+ cache += tmp & ((1 << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL); \
+ prob -= cache >> RC_MOVE_BITS; \
+ pre0; tmp = ~tmp; pre1; \
+ rc.code -= tmp & rc_bound; \
+ if (!tmp) { action0; } else { action1; }; \
+} while (0)
+#endif
+#endif
+#ifdef rc_bit_nobranch
+#define rc_bit(prob, action0, action1, seq) \
+ rc_bit_nobranch(prob, , symbol = (symbol << 1) - tmp, \
+ action0, action1, seq)
+#define rc_bit_matched(prob, seq) \
+ rc_bit_nobranch(prob, offset &= match_bit ^ tmp, \
+ symbol = (symbol << 1) - tmp, , , seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit_nobranch(prob, , \
+ symbol = (symbol << 1) - tmp; rep0 += offset & tmp, \
+ , , seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_nobranch(prob, , rep0 += offset & tmp, , , seq);
+#else
#define rc_bit(prob, action0, action1, seq) \
rc_bit_last(prob, \
symbol <<= 1; action0, \
symbol = (symbol << 1) + 1; action1, \
seq);
+#define rc_bit_matched(prob, seq) \
+ rc_bit(prob, offset &= ~match_bit, offset &= match_bit, seq)
+#define rc_bit_dist(prob, offset, seq) \
+ rc_bit(prob, , rep0 += offset, seq);
+#define rc_bit_dist_last(prob, offset, seq) \
+ rc_bit_last(prob, , rep0 += offset, seq)
+#endif
/// Like rc_bit() but add "case seq:" as a prefix. This makes the unrolled
--
2.17.1
@amonakov

AMD 4500U (zen 2):
Before:

$ perf stat -r 5 ./src/xzdec/xzdec linux-5.15.tar.xz >/dev/null

 Performance counter stats for './src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

         11,001.91 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.06% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,247      page-faults:u             #    1.568 K/sec                    ( +-  0.02% )
    25,839,466,988      cycles:u                  #    2.349 GHz                      ( +-  0.06% )
       570,586,183      stalled-cycles-frontend:u #    2.21% frontend cycles idle     ( +-  0.18% )
     4,603,924,078      stalled-cycles-backend:u  #   17.82% backend cycles idle      ( +-  0.28% )
    33,990,630,407      instructions:u            #    1.32  insn per cycle
                                                  #    0.14  stalled cycles per insn  ( +-  0.00% )
     4,392,930,509      branches:u                #  399.288 M/sec                    ( +-  0.00% )
       448,633,646      branch-misses:u           #   10.21% of all branches          ( +-  0.02% )

          11.00174 +- 0.00639 seconds time elapsed  ( +-  0.06% )

After:

$ perf stat -r 5 ./src/xzdec/xzdec linux-5.15.tar.xz >/dev/null

 Performance counter stats for './src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

         10,886.97 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.06% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,250      page-faults:u             #    1.584 K/sec                    ( +-  0.01% )
    25,572,364,776      cycles:u                  #    2.349 GHz                      ( +-  0.06% )
       341,349,770      stalled-cycles-frontend:u #    1.33% frontend cycles idle     ( +-  0.08% )
     9,514,632,432      stalled-cycles-backend:u  #   37.21% backend cycles idle      ( +-  0.16% )
    36,278,706,843      instructions:u            #    1.42  insn per cycle
                                                  #    0.26  stalled cycles per insn  ( +-  0.00% )
     3,759,133,527      branches:u                #  345.287 M/sec                    ( +-  0.00% )
       310,622,266      branch-misses:u           #    8.26% of all branches          ( +-  0.02% )

          10.88696 +- 0.00634 seconds time elapsed  ( +-  0.06% )

@ilyakurdyukov
Author

For this archive, yes, the difference is subtle.

Maybe it's because this hack does not optimize the rc_bit() macro used in SEQ_LITERAL_MATCHED, which can be heavily used for text data (I guess this archive contains mostly text). But I can try to fix that.
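
For context, this is roughly what that matched-literal loop looks like (a rough sketch simplified from lzma_decoder.c; dict_get(), probs[] and rc_bit() as in liblzma). The extra updates of "offset" in both rc_bit() actions are what make this path need its own treatment:

/* Rough sketch of the matched-literal decode loop, not the exact source. */
uint32_t match_byte = dict_get(&dict, rep0); /* byte at the last match distance */
uint32_t offset = 0x100;
uint32_t symbol = 1;
do {
	match_byte <<= 1;
	const uint32_t match_bit = match_byte & offset;
	const uint32_t subcoder_index = offset + match_bit + symbol;
	rc_bit(probs[subcoder_index],
			offset &= ~match_bit,  /* extra action for a decoded 0 */
			offset &= match_bit,   /* extra action for a decoded 1 */
			SEQ_LITERAL_MATCHED);
} while (symbol < 0x100);

Once a decoded bit disagrees with the match byte, "offset" drops to zero and the decoder falls back to the plain literal subcoders.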

On my Skylake:

perf stat -r 5 build/src/xz/.libs/xz -c -d linux-5.15.7.tar.xz > /dev/null

Before:

      7 493,13 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,16% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 469      page-faults:u             #    0,002 M/sec                    ( +-  0,00% )
27 964 283 939      cycles:u                  #    3,732 GHz                      ( +-  0,15% )
33 129 609 542      instructions:u            #    1,18  insn per cycle           ( +-  0,00% )
 4 221 678 998      branches:u                #  563,407 M/sec                    ( +-  0,00% )
   490 799 588      branch-misses:u           #   11,63% of all branches          ( +-  0,02% )

        7,4940 +- 0,0121 seconds time elapsed  ( +-  0,16% )

After:

     7 221,74 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,29% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 469      page-faults:u             #    0,002 M/sec                    ( +-  0,00% )
26 928 558 378      cycles:u                  #    3,729 GHz                      ( +-  0,26% )
35 371 885 714      instructions:u            #    1,31  insn per cycle           ( +-  0,00% )
 3 572 561 046      branches:u                #  494,695 M/sec                    ( +-  0,00% )
   331 262 037      branch-misses:u           #    9,27% of all branches          ( +-  0,03% )

        7,2223 +- 0,0206 seconds time elapsed  ( +-  0,29% )

@ilyakurdyukov
Author

When I compress Firefox from my system (tar -cf lib.tar /usr/lib/firefox, which totals 226549760 bytes) using -7e --format=lzma, the difference is bigger (+11%):

Before:

      3 285,02 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,09% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
         4 181      page-faults:u             #    0,001 M/sec                    ( +-  0,01% )
12 341 894 334      cycles:u                  #    3,757 GHz                      ( +-  0,09% )
13 928 815 327      instructions:u            #    1,13  insn per cycle           ( +-  0,00% )
 1 781 400 452      branches:u                #  542,281 M/sec                    ( +-  0,00% )
   236 255 408      branch-misses:u           #   13,26% of all branches          ( +-  0,02% )

       3,28599 +- 0,00279 seconds time elapsed  ( +-  0,08% )

After:

      2 961,76 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,03% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
         4 181      page-faults:u             #    0,001 M/sec                    ( +-  0,01% )
11 116 968 963      cycles:u                  #    3,754 GHz                      ( +-  0,03% )
15 027 237 347      instructions:u            #    1,35  insn per cycle           ( +-  0,00% )
 1 474 337 222      branches:u                #  497,791 M/sec                    ( +-  0,00% )
   151 726 579      branch-misses:u           #   10,29% of all branches          ( +-  0,03% )

       2,96238 +- 0,00109 seconds time elapsed  ( +-  0,04% )

But anyway, I need to fix SEQ_LITERAL_MATCHED, which makes the patch bigger but should make the optimization's impact more stable.

@ilyakurdyukov
Author

I have updated the patch, which now includes SEQ_LITERAL_MATCHED, but the impact on Linux kernel sources hasn't changed.

However, the decompression speedup for binary data has increased: 35% -> 39% for the Python3 build directory and 11% -> 15% for Firefox binaries and data.

@ilyakurdyukov
Author

I made an update that speeds up decompression by 10% for these Linux kernel sources.

@cinquemb

I wonder how it compares with this: http://mattmahoney.net/dc/text.html

@haarp

haarp commented Dec 13, 2021

Please try to get this patch upstream. There's a lot of energy that could be saved world-wide with more efficient (de)compression :)

@amonakov

New 4500U (Zen 2) stats:

 Performance counter stats for './src/xzdec/xzdec linux-5.15.tar.xz' (3 runs):

         10,150.53 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.29% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,255      page-faults:u             #    1.700 K/sec                    ( +-  0.01% )
    23,828,253,375      cycles:u                  #    2.347 GHz                      ( +-  0.29% )
       236,171,644      stalled-cycles-frontend:u #    0.99% frontend cycles idle     ( +-  0.45% )
    10,985,580,685      stalled-cycles-backend:u  #   46.10% backend cycles idle      ( +-  0.65% )
    37,691,902,062      instructions:u            #    1.58  insn per cycle
                                                  #    0.29  stalled cycles per insn  ( +-  0.00% )
     3,505,241,976      branches:u                #  345.326 M/sec                    ( +-  0.00% )
       207,417,196      branch-misses:u           #    5.92% of all branches          ( +-  0.01% )

           10.1500 +- 0.0299 seconds time elapsed  ( +-  0.29% )

@amonakov

On Intel i5-2500 (Sandybridge):

Before:

 Performance counter stats for 'src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

          8,954.71 msec task-clock:u              #    0.998 CPUs utilized            ( +-  0.04% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
             1,368      page-faults:u             #  152.724 /sec                     ( +-  0.26% )
    29,347,257,196      cycles:u                  #    3.277 GHz                      ( +-  0.01% )
    12,337,648,629      stalled-cycles-frontend:u #   42.04% frontend cycles idle     ( +-  0.05% )
     9,029,544,206      stalled-cycles-backend:u  #   30.77% backend cycles idle      ( +-  0.05% )
    33,590,809,450      instructions:u            #    1.14  insn per cycle
                                                  #    0.37  stalled cycles per insn  ( +-  0.00% )
     4,390,208,316      branches:u                #  490.268 M/sec                    ( +-  0.00% )
       477,470,512      branch-misses:u           #   10.88% of all branches          ( +-  0.02% )

           8.97213 +- 0.00262 seconds time elapsed  ( +-  0.03% )

After:

 Performance counter stats for 'src/xzdec/xzdec linux-5.15.tar.xz' (5 runs):

          8,519.22 msec task-clock:u              #    0.994 CPUs utilized            ( +-  0.44% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
             1,358      page-faults:u             #  159.381 /sec                     ( +-  0.11% )
    28,032,300,841      cycles:u                  #    3.290 GHz                      ( +-  0.04% )
    13,667,678,436      stalled-cycles-frontend:u #   48.76% frontend cycles idle     ( +-  0.08% )
     8,181,854,786      stalled-cycles-backend:u  #   29.19% backend cycles idle      ( +-  0.10% )
    37,315,478,332      instructions:u            #    1.33  insn per cycle
                                                  #    0.37  stalled cycles per insn  ( +-  0.00% )
     3,502,520,091      branches:u                #  411.132 M/sec                    ( +-  0.00% )
       224,562,347      branch-misses:u           #    6.41% of all branches          ( +-  0.08% )

           8.57411 +- 0.00504 seconds time elapsed  ( +-  0.06% )

@ilyakurdyukov
Author

It looks like the results I got on a home computer with a Skylake processor are the best.

Intel Atom: 2-3% speedup
Sandybridge: 5% speedup
Zen 2: 8% speedup

At least there are no negative results yet.
But this is for decompressing text data; there is usually more speedup for binary data.

@amonakov

I got closer to 9% on Ivybridge (the successor of the Sandybridge uarch) on the linux-5.15.tar.xz archive; I don't have perf on that machine at the moment, unfortunately.

On Atom I guess the misprediction penalty is a bit smaller, and the significant increase in instruction count is eating some of the benefit of the reduced mispredictions?

I don't mind running additional tests on binary data; we just need to select some common archive to benchmark on (something that anyone could retrieve). How about the 175MiB linux-firmware archive at https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz ?

@ilyakurdyukov
Author

@amonakov, yes, it is a good archive for testing binary-data decompression performance.

export LD_LIBRARY_PATH=build/src/liblzma/.libs
perf stat -r 5 build/src/xz/.libs/xz -c -d linux-firmware-20211027.tar.xz > /dev/null

So, on Skylake before:

      8 173,04 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,03% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 467      page-faults:u             #    0,002 M/sec                    ( +-  0,00% )
30 598 206 585      cycles:u                  #    3,744 GHz                      ( +-  0,02% )
34 721 982 623      instructions:u            #    1,13  insn per cycle           ( +-  0,00% )
 4 194 995 085      branches:u                #  513,272 M/sec                    ( +-  0,00% )
   556 648 934      branch-misses:u           #   13,27% of all branches          ( +-  0,00% )

       8,17348 +- 0,00210 seconds time elapsed  ( +-  0,03% )

And after:

      6 347,72 msec task-clock:u              #    1,000 CPUs utilized            ( +-  0,02% )
             0      context-switches:u        #    0,000 K/sec                  
             0      cpu-migrations:u          #    0,000 K/sec                  
        16 468      page-faults:u             #    0,003 M/sec                    ( +-  0,00% )
23 685 264 720      cycles:u                  #    3,731 GHz                      ( +-  0,02% )
39 467 984 353      instructions:u            #    1,67  insn per cycle           ( +-  0,00% )
 2 941 144 012      branches:u                #  463,338 M/sec                    ( +-  0,00% )
   165 593 141      branch-misses:u           #    5,63% of all branches          ( +-  0,09% )

      6,348644 +- 0,000952 seconds time elapsed  ( +-  0,01% )

That is a speedup of almost 29%.

@ilyakurdyukov
Author

Intel Celeron 1007U (Ivy Bridge)

linux-5.15.7.tar.xz : 19.33 --> 18.07 (+7%)
linux-firmware-20211027.tar.xz : 20.57 --> 17.76 (+16%)

@amonakov

Ryzen 4500U (Zen 2)

Before:

 Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):

         11,907.43 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.03% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,240      page-faults:u             #    1.448 K/sec                    ( +-  0.02% )
    27,992,949,586      cycles:u                  #    2.351 GHz                      ( +-  0.04% )
       583,797,051      stalled-cycles-frontend:u #    2.09% frontend cycles idle     ( +-  0.06% )
     3,412,760,275      stalled-cycles-backend:u  #   12.19% backend cycles idle      ( +-  0.36% )
    35,183,073,831      instructions:u            #    1.26  insn per cycle
                                                  #    0.10  stalled cycles per insn  ( +-  0.00% )
     4,182,899,643      branches:u                #  351.285 M/sec                    ( +-  0.00% )
       531,756,997      branch-misses:u           #   12.71% of all branches          ( +-  0.01% )

          11.90746 +- 0.00389 seconds time elapsed  ( +-  0.03% )

After:

 Performance counter stats for 'src/xzdec/xzdec ../linux-firmware-20211027.tar.xz' (3 runs):

          9,797.98 msec task-clock:u              #    1.000 CPUs utilized            ( +-  0.11% )
                 0      context-switches:u        #    0.000 /sec
                 0      cpu-migrations:u          #    0.000 /sec
            17,247      page-faults:u             #    1.760 K/sec                    ( +-  0.02% )
    23,012,154,561      cycles:u                  #    2.349 GHz                      ( +-  0.02% )
       173,975,497      stalled-cycles-frontend:u #    0.76% frontend cycles idle     ( +-  0.09% )
    12,818,655,196      stalled-cycles-backend:u  #   55.70% backend cycles idle      ( +-  0.02% )
    39,198,156,956      instructions:u            #    1.70  insn per cycle
                                                  #    0.33  stalled cycles per insn  ( +-  0.00% )
     2,970,714,992      branches:u                #  303.197 M/sec                    ( +-  0.00% )
       156,064,253      branch-misses:u           #    5.25% of all branches          ( +-  0.01% )

            9.7976 +- 0.0107 seconds time elapsed  ( +-  0.11% )

@ilyakurdyukov
Author

Did a minor update to cover the rc_bit() macro, which is used when XZ is configured with the --enable-small option.

@ilyakurdyukov
Author

... and I did the same for AArch64, using the CSEL instruction. (patch updated)

But I only have an Allwinner H616 (Cortex-A53) for testing:

linux-5.15.7.tar.xz : 25.12 --> 23.85 (+5%)
linux-firmware-20211027.tar.xz : 22.80 --> 21.10 (+8%)

@astei

astei commented Dec 15, 2021

AArch64, AWS Graviton2 (c6g.xlarge size):

Before:

 Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):

           8751.09 msec task-clock                #    1.000 CPUs utilized            ( +-  0.02% )
                44      context-switches          #    0.005 K/sec                    ( +-  5.47% )
                 0      cpu-migrations            #    0.000 K/sec                    ( +- 61.24% )
             18112      page-faults               #    0.002 M/sec                    ( +-  0.05% )
       21832211556      cycles                    #    2.495 GHz                      ( +-  0.02% )
       30273778750      instructions              #    1.39  insn per cycle           ( +-  0.01% )
   <not supported>      branches                                                    
         535339600      branch-misses                                                 ( +-  0.01% )

           8.75277 +- 0.00198 seconds time elapsed  ( +-  0.02% )

After:

 Performance counter stats for './src/xzdec/xzdec ./linux-firmware-20211027.tar.xz' (5 runs):

           8111.05 msec task-clock                #    1.000 CPUs utilized            ( +-  0.01% )
                48      context-switches          #    0.006 K/sec                    ( +-  2.54% )
                 1      cpu-migrations            #    0.000 K/sec                    ( +- 31.62% )
             18088      page-faults               #    0.002 M/sec                    ( +-  0.01% )
       20232553634      cycles                    #    2.494 GHz                      ( +-  0.01% )
       34157801215      instructions              #    1.69  insn per cycle           ( +-  0.00% )
   <not supported>      branches                                                    
         159725908      branch-misses                                                 ( +-  0.02% )

          8.112706 +- 0.000784 seconds time elapsed  ( +-  0.01% )

~7.9% speedup.

@ilyakurdyukov
Author

Results from ALT Linux team:

Comet Lake:

linux-5.15.8.tar.xz : 5.81 --> 5.30 (+9.6%)
linux-firmware-20211027.tar.xz : 6.14 --> 5.31 (+15.6%)

Intel Xeon (Ivy Bridge EP):

linux-5.15.8.tar.xz : 7.987+-0.363 --> 7.128+-0.288 ( 10.75+-5.80 % )
linux-firmware-20211027.tar.xz : 8.552+-0.354 --> 7.059+-0.310 ( 17.46+-5.51 % )

@ilyakurdyukov
Author

I wrote scripts to make testing easier:

# downloads test files and builds XZ
./build.sh
# tests decompression speed
./bench.sh ref
./bench.sh new

build.sh

#!/bin/bash
set -e

# patch
[ -f faster_lzma_decoder_x86.patch ] || wget https://gist.githubusercontent.com/ilyakurdyukov/f514418f3affd677e1ac408ec0c09188/raw/faster_lzma_decoder_x86.patch
# test data
[ -f linux-5.15.7.tar.xz ] || wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.7.tar.xz
[ -f linux-firmware-20211027.tar.xz ] || wget https://mirrors.edge.kernel.org/pub/linux/kernel/firmware/linux-firmware-20211027.tar.xz

[ -d xz ] || git clone http://git.tukaani.org/xz.git
mkdir -p ref
mkdir -p new

# build ref

cp -r xz build
cd build
# or cmake as an option
./autogen.sh --no-po4a
./configure
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../ref/

# build new

patch -p1 < ../faster_lzma_decoder_x86.patch
make
cp -L src/liblzma/.libs/liblzma.so.5 src/xz/.libs/xz ../new/
cd ..

bench.sh

#!/bin/bash

dir=${1:-"ref"}
export LD_LIBRARY_PATH=$(pwd)/$dir
# to make sure the correct liblzma is loaded 
ldd $dir/xz | grep liblzma

# or "perf stat -r 5" instead of "for" and "time -p"

for i in 1 2 3; do
	echo "# ($dir:$i) linux-5.15.7"
	time -p $dir/xz -c -d linux-5.15.7.tar.xz > /dev/null
	echo "# ($dir:$i) linux-firmware-20211027"
	time -p $dir/xz -c -d linux-firmware-20211027.tar.xz > /dev/null
done

@ilyakurdyukov
Author

Kunpeng-920 (AArch64)

linux-5.15.7.tar.xz : 8.82 --> 8.78 (+0.4%)
linux-firmware-20211027.tar.xz : 8.97 --> 8.44 (+6.2%)

@ilyakurdyukov
Author

I got a report that this patch slows down decoding on the Apple M1, but I am unable to investigate this on my own.

Well, at least it will be easy to exclude it on the Apple M1 by adding some #ifndef.
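
For example, a hypothetical (untested) guard built on the NO_BRANCH_OPT switch that the patch already checks could look like this:

/* Hypothetical, untested: disable the branchless path on Apple AArch64
 * (e.g. the M1) by defining the guard the patch checks with #ifndef. */
#if defined(__APPLE__) && defined(__aarch64__)
#define NO_BRANCH_OPT
#endif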
