Skip to content

Instantly share code, notes, and snippets.

@animetosho
Last active July 17, 2023 22:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save animetosho/3161ca019751cc0bd70d49c8a4e0d754 to your computer and use it in GitHub Desktop.
Save animetosho/3161ca019751cc0bd70d49c8a4e0d754 to your computer and use it in GitHub Desktop.
Quick'n'dirty patch to enable x86/ARM AES acceleration (plus Blake2s/RS16 x86 SIMD) in GCC/Clang for unrar 6.2.6
# apply with `patch -s -p1 < unrar-gcc.patch` in unrar 6.2.6 source directory
--- a/blake2s.cpp
+++ b/blake2s.cpp
@@ -18,7 +18,7 @@
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};
-static const byte blake2s_sigma[10][16] =
+const byte blake2s_sigma[10][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
--- a/blake2s_sse.cpp
+++ b/blake2s_sse.cpp
@@ -10,6 +10,7 @@
static __m128i crotr8, crotr16;
#endif
+TARGET_FUNC("sse2")
static void blake2s_init_sse()
{
// We cannot initialize these 128 bit variables in place when declaring
@@ -100,7 +101,11 @@
UNDIAGONALIZE(row[0],row[1],row[2],row[3]); \
}
-
+#ifdef _WIN_64
+TARGET_FUNC("ssse3")
+#else
+TARGET_FUNC("sse2")
+#endif
static int blake2s_compress_sse( blake2s_state *S, const byte block[BLAKE2S_BLOCKBYTES] )
{
__m128i row[4];
--- a/rar.hpp
+++ b/rar.hpp
@@ -1,6 +1,43 @@
#ifndef _RAR_RARCOMMON_
#define _RAR_RARCOMMON_
+#ifdef __GNUC__
+#define TARGET_FUNC(t) __attribute__((target(t)))
+#else
+#define TARGET_FUNC(t)
+#endif
+#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
+#define USE_SSE
+#define SSE_ALIGNMENT 16
+#include <x86intrin.h>
+#define __cpuid(ar, eax) __asm__("cpuid\n" : "=a"(ar[0]), "=b"(ar[1]), "=c"(ar[2]), "=d"(ar[3]) : "a"(eax) : )
+// needed for blake2s_sse
+#if defined(__x86_64__) || defined(__amd64__)
+#define _WIN_64
+#else
+#define _WIN_32
+#endif
+#endif
+#if defined(__aarch64__) || defined(__armv7__) || defined(__arm__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 6)
+#ifdef __has_include
+#if __has_include(<sys/auxv.h>) && __has_include(<arm_neon.h>)
+#define USE_NEON
+#include <sys/auxv.h>
+#ifdef __FreeBSD__
+static unsigned long getauxval(unsigned long cap) {
+ unsigned long ret;
+ elf_aux_info(cap, &ret, sizeof(ret));
+ return ret;
+}
+#endif
+#if __has_include(<asm/hwcap.h>)
+#include <asm/hwcap.h>
+#endif
+#include <arm_neon.h>
+#endif
+#endif
+#endif
+
#include "raros.hpp"
#include "rartypes.hpp"
#include "os.hpp"
--- a/rijndael.hpp
+++ b/rijndael.hpp
@@ -13,17 +13,26 @@
{
private:
#ifdef USE_SSE
+ TARGET_FUNC("aes")
void blockEncryptSSE(const byte *input,size_t numBlocks,byte *outBuffer);
+ TARGET_FUNC("aes")
void blockDecryptSSE(const byte *input, size_t numBlocks, byte *outBuffer);
bool AES_NI;
#endif
#ifdef USE_NEON
// Set "crypto" attribute as replacement of -march=armv8-a+crypto switch.
- __attribute__((target("crypto")))
+#ifdef __clang__
+ TARGET_FUNC("crypto")
void blockEncryptNeon(const byte *input,size_t numBlocks,byte *outBuffer);
- __attribute__((target("crypto")))
+ TARGET_FUNC("crypto")
void blockDecryptNeon(const byte *input, size_t numBlocks, byte *outBuffer);
+#else
+ TARGET_FUNC("arch=armv8-a+crypto")
+ void blockEncryptNeon(const byte *input,size_t numBlocks,byte *outBuffer);
+ TARGET_FUNC("arch=armv8-a+crypto")
+ void blockDecryptNeon(const byte *input, size_t numBlocks, byte *outBuffer);
+#endif
bool AES_Neon;
#endif
--- a/rs16.hpp
+++ b/rs16.hpp
@@ -17,6 +17,7 @@
void InvertDecoderMatrix();
#ifdef USE_SSE
+ TARGET_FUNC("ssse3")
bool SSE_UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize);
#endif
@sanderjo
Copy link

I get:

sander@zwart2204:~/unrar-6.2.6/unrar$ patch -s -p1 < unrar-gcc.patch
patch: **** malformed patch at line 68:  #include "os.hpp"
sander@zwart2204:~/unrar-6.2.6/unrar$ wget https://gist.githubusercontent.com/animetosho/3161ca019751cc0bd70d49c8a4e0d754/raw/4930e76e9025935df011b86a10e9fd0db049db02/unrar-gcc.patch

sander@zwart2204:~/unrar-6.2.6/unrar$ md5sum *patch
16d16c9afb6eee4a898bc12d680c7284  unrar-gcc.patch

sander@zwart2204:~/unrar-6.2.6/unrar$ wc -l *patch
109 unrar-gcc.patch

@animetosho
Copy link
Author

Oops, could you try this again?
Thanks!

@sanderjo
Copy link

Works!

rm -rf unrar
tar xvzf unrarsrc-6.2.6.tar.gz 
cd unrar/
wget https://gist.githubusercontent.com/animetosho/3161ca019751cc0bd70d49c8a4e0d754/raw/a7ac79dbf45f2bbe0231a81c929affc6301cf426/unrar-gcc.patch
md5sum unrar-gcc.patch 
head *patch
patch -s -p1 < unrar-gcc.patch
make
./unrar 

@sanderjo
Copy link

sanderjo commented Jul 17, 2023

How can I see the new unrar has Intel SIMD?

new unrar:

sander@zwart2204:~/unrar-6.2.6/unrar$ objdump -d ./unrar | awk '/[ \t](addps|addss|andnps|andps|cmpps|cmpss|comiss|cvtpi2ps|cvtps2pi|cvtsi2ss|cvtss2s|cvttps2pi|cvttss2si|divps|divss|ldmxcsr|maxps|maxss|minps|minss|movaps|movhlps|movhps|movlhps|movlps|movmskps|movntps|movss|movups|mulps|mulss|orps|rcpps|rcpss|rsqrtps|rsqrtss|shufps|sqrtps|sqrtss|stmxcsr|subps|subss|ucomiss|unpckhps|unpcklps|xorps|pavgb|pavgw|pextrw|pinsrw|pmaxsw|pmaxub|pminsw|pminub|pmovmskb|psadbw|pshufw)[ \t]/' | wc -l
354

Looks good (right?), but the old unrar has SIMD instructions too, and even more matching lines?

sander@zwart2204:~/unrar-6.2.6/unrar$ objdump -d /usr/bin/unrar | awk '/[ \t](addps|addss|andnps|andps|cmpps|cmpss|comiss|cvtpi2ps|cvtps2pi|cvtsi2ss|cvtss2s|cvttps2pi|cvttss2si|divps|divss|ldmxcsr|maxps|maxss|minps|minss|movaps|movhlps|movhps|movlhps|movlps|movmskps|movntps|movss|movups|mulps|mulss|orps|rcpps|rcpss|rsqrtps|rsqrtss|shufps|sqrtps|sqrtss|stmxcsr|subps|subss|ucomiss|unpckhps|unpcklps|xorps|pavgb|pavgw|pextrw|pinsrw|pmaxsw|pmaxub|pminsw|pminub|pmovmskb|psadbw|pshufw)[ \t]/' | wc -l
1279

@animetosho
Copy link
Author

You can try to grep for the aesenc instruction.
Ideally, though, remove the strip command in the makefile, rebuild, and run perf record ./unrar ..., then run perf report and check the profile. The profile should show what's consuming CPU.

@sanderjo
Copy link

with the SIMD unrar:

sudo perf record ../unrar x  postfile.part01.rar

sudo perf report

which says:

Samples: 7K of event 'cycles', Event count (approx.): 6942836427
Overhead  Command  Shared Object         Symbol
  15,71%  unrar    unrar                 [.] 0x0000000000013b66
   9,72%  unrar    [kernel.kallsyms]     [k] copy_user_enhanced_fast_string
   4,17%  unrar    unrar                 [.] 0x0000000000013b45
   4,12%  unrar    unrar                 [.] 0x0000000000013b85
   4,10%  unrar    unrar                 [.] 0x0000000000013b8d
   4,10%  unrar    unrar                 [.] 0x0000000000013b94
   3,99%  unrar    unrar                 [.] 0x0000000000013b39
   3,97%  unrar    unrar                 [.] 0x0000000000013b57
   3,89%  unrar    unrar                 [.] 0x0000000000013ba8
   3,79%  unrar    unrar                 [.] 0x0000000000013b7a
   3,67%  unrar    unrar                 [.] 0x0000000000013b78
   3,33%  unrar    unrar                 [.] 0x0000000000013ba3
   2,94%  unrar    [kernel.kallsyms]     [k] __filemap_add_folio
   1,27%  unrar    [kernel.kallsyms]     [k] clear_page_erms
   1,22%  unrar    [kernel.kallsyms]     [k] rmqueue_bulk
   0,98%  unrar    [kernel.kallsyms]     [k] mem_cgroup_css_rstat_flush
   0,84%  unrar    [kernel.kallsyms]     [k] consume_stock
   0,79%  unrar    [kernel.kallsyms]     [k] blkcg_rstat_flush
   0,74%  unrar    [kernel.kallsyms]     [k] xas_load
   0,66%  unrar    [kernel.kallsyms]     [k] __pagevec_lru_add_fn
   0,54%  unrar    [kernel.kallsyms]     [k] get_mem_cgroup_from_mm
   0,51%  unrar    [kernel.kallsyms]     [k] ext4_fill_raw_inode
   0,46%  unrar    [kernel.kallsyms]     [k] fault_in_readable
   0,46%  unrar    [kernel.kallsyms]     [k] try_charge_memcg
   0,46%  unrar    [kernel.kallsyms]     [k] __mem_cgroup_charge
   0,46%  unrar    [kernel.kallsyms]     [k] kmem_cache_alloc

plus

Cannot load tips.txt file, please install perf!

@sanderjo
Copy link

with plain old unrar:

Samples: 8K of event 'cycles', Event count (approx.): 7529120598
Overhead  Command  Shared Object      Symbol
  17,71%  unrar    unrar              [.] 0x00000000000119f8
   9,37%  unrar    [kernel.kallsyms]  [k] copy_user_enhanced_fast_string
   5,73%  unrar    unrar              [.] 0x0000000000011a0e
   4,05%  unrar    unrar              [.] 0x0000000000011a3e
   3,80%  unrar    unrar              [.] 0x0000000000011a18
   3,78%  unrar    unrar              [.] 0x0000000000011a2f
   3,65%  unrar    unrar              [.] 0x00000000000119ff
   3,60%  unrar    unrar              [.] 0x0000000000011a4a
   3,49%  unrar    unrar              [.] 0x0000000000011a5d
   3,27%  unrar    unrar              [.] 0x00000000000119f5
   3,24%  unrar    unrar              [.] 0x00000000000119f2
   3,10%  unrar    unrar              [.] 0x0000000000011a48
   3,04%  unrar    unrar              [.] 0x0000000000011a25
   2,30%  unrar    [kernel.kallsyms]  [k] __filemap_add_folio
   1,01%  unrar    [kernel.kallsyms]  [k] clear_page_erms
   0,88%  unrar    [kernel.kallsyms]  [k] rmqueue_bulk
   0,87%  unrar    [kernel.kallsyms]  [k] mem_cgroup_css_rstat_flush
   0,73%  unrar    [kernel.kallsyms]  [k] xas_load
   0,66%  unrar    [kernel.kallsyms]  [k] blkcg_rstat_flush
   0,62%  unrar    [kernel.kallsyms]  [k] get_mem_cgroup_from_mm
   0,61%  unrar    [kernel.kallsyms]  [k] __pagevec_lru_add_fn
   0,58%  unrar    [kernel.kallsyms]  [k] consume_stock
   0,51%  unrar    [kernel.kallsyms]  [k] try_charge_memcg
   0,46%  unrar    [kernel.kallsyms]  [k] _raw_spin_lock
   0,45%  unrar    [kernel.kallsyms]  [k] fault_in_readable
   0,42%  unrar    [kernel.kallsyms]  [k] filemap_get_read_batch
   0,42%  unrar    [kernel.kallsyms]  [k] __mem_cgroup_charge
   0,42%  unrar    [kernel.kallsyms]  [k] ext4_fill_raw_inode

@sanderjo
Copy link

SIMD-unrar:

$ time ../unrar x postfile.part01.rar

real	0m2,410s
user	0m0,980s
sys	0m0,717s

old unrar:

$ time /usr/bin/unrar x postfile.part01.rar

real	0m2,651s
user	0m1,082s
sys	0m0,759s

So the SIMD-unrar needs about 10% less processing time?

@animetosho
Copy link
Author

You'll want to remove the $(STRIP) unrar line in the makefile and rebuild before doing the perf profile; otherwise the function names get stripped (you'll want to do this for both versions of unrar).

I've made some fixes to the patch though (I'm messing this up a bit :/), so try doing a build again with the new patch.
Thanks for testing so far!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment