-
-
Save animetosho/3161ca019751cc0bd70d49c8a4e0d754 to your computer and use it in GitHub Desktop.
# apply with `patch -s -p1 < unrar-gcc.patch` in unrar 6.2.6 source directory | |
--- a/blake2s.cpp | |
+++ b/blake2s.cpp | |
@@ -18,7 +18,7 @@ | |
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL | |
}; | |
-static const byte blake2s_sigma[10][16] = | |
+const byte blake2s_sigma[10][16] = | |
{ | |
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , | |
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , | |
--- a/blake2s_sse.cpp | |
+++ b/blake2s_sse.cpp | |
@@ -10,6 +10,7 @@ | |
static __m128i crotr8, crotr16; | |
#endif | |
+TARGET_FUNC("sse2") | |
static void blake2s_init_sse() | |
{ | |
// We cannot initialize these 128 bit variables in place when declaring | |
@@ -100,7 +101,11 @@ | |
UNDIAGONALIZE(row[0],row[1],row[2],row[3]); \ | |
} | |
- | |
+#ifdef _WIN_64 | |
+TARGET_FUNC("ssse3") | |
+#else | |
+TARGET_FUNC("sse2") | |
+#endif | |
static int blake2s_compress_sse( blake2s_state *S, const byte block[BLAKE2S_BLOCKBYTES] ) | |
{ | |
__m128i row[4]; | |
--- a/rar.hpp | |
+++ b/rar.hpp | |
@@ -1,6 +1,43 @@ | |
#ifndef _RAR_RARCOMMON_ | |
#define _RAR_RARCOMMON_ | |
+#ifdef __GNUC__ | |
+#define TARGET_FUNC(t) __attribute__((target(t))) | |
+#else | |
+#define TARGET_FUNC(t) | |
+#endif | |
+#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) | |
+#define USE_SSE | |
+#define SSE_ALIGNMENT 16 | |
+#include <x86intrin.h> | |
+#define __cpuid(ar, eax) __asm__("cpuid\n" : "=a"(ar[0]), "=b"(ar[1]), "=c"(ar[2]), "=d"(ar[3]) : "a"(eax) : ) | |
+// needed for blake2s_sse | |
+#if defined(__x86_64__) || defined(__amd64__) | |
+#define _WIN_64 | |
+#else | |
+#define _WIN_32 | |
+#endif | |
+#endif | |
+#if defined(__aarch64__) || defined(__armv7__) || defined(__arm__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 6) | |
+#ifdef __has_include | |
+#if __has_include(<sys/auxv.h>) && __has_include(<arm_neon.h>) | |
+#define USE_NEON | |
+#include <sys/auxv.h> | |
+#ifdef __FreeBSD__ | |
+static unsigned long getauxval(unsigned long cap) { | |
+ unsigned long ret; | |
+ elf_aux_info(cap, &ret, sizeof(ret)); | |
+ return ret; | |
+} | |
+#endif | |
+#if __has_include(<asm/hwcap.h>) | |
+#include <asm/hwcap.h> | |
+#endif | |
+#include <arm_neon.h> | |
+#endif | |
+#endif | |
+#endif | |
+ | |
#include "raros.hpp" | |
#include "rartypes.hpp" | |
#include "os.hpp" | |
--- a/rijndael.hpp | |
+++ b/rijndael.hpp | |
@@ -13,17 +13,26 @@ | |
{ | |
private: | |
#ifdef USE_SSE | |
+ TARGET_FUNC("aes") | |
void blockEncryptSSE(const byte *input,size_t numBlocks,byte *outBuffer); | |
+ TARGET_FUNC("aes") | |
void blockDecryptSSE(const byte *input, size_t numBlocks, byte *outBuffer); | |
bool AES_NI; | |
#endif | |
#ifdef USE_NEON | |
// Set "crypto" attribute as replacement of -march=armv8-a+crypto switch. | |
- __attribute__((target("crypto"))) | |
+#ifdef __clang__ | |
+ TARGET_FUNC("crypto") | |
void blockEncryptNeon(const byte *input,size_t numBlocks,byte *outBuffer); | |
- __attribute__((target("crypto"))) | |
+ TARGET_FUNC("crypto") | |
void blockDecryptNeon(const byte *input, size_t numBlocks, byte *outBuffer); | |
+#else | |
+ TARGET_FUNC("arch=armv8-a+crypto") | |
+ void blockEncryptNeon(const byte *input,size_t numBlocks,byte *outBuffer); | |
+ TARGET_FUNC("arch=armv8-a+crypto") | |
+ void blockDecryptNeon(const byte *input, size_t numBlocks, byte *outBuffer); | |
+#endif | |
bool AES_Neon; | |
#endif | |
--- a/rs16.hpp | |
+++ b/rs16.hpp | |
@@ -17,6 +17,7 @@ | |
void InvertDecoderMatrix(); | |
#ifdef USE_SSE | |
+ TARGET_FUNC("ssse3") | |
bool SSE_UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize); | |
#endif | |
Oops, could you try this again?
Thanks!
Works!
rm -rf unrar
tar xvzf unrarsrc-6.2.6.tar.gz
cd unrar/
wget https://gist.githubusercontent.com/animetosho/3161ca019751cc0bd70d49c8a4e0d754/raw/a7ac79dbf45f2bbe0231a81c929affc6301cf426/unrar-gcc.patch
md5sum unrar-gcc.patch
head *patch
patch -s -p1 < unrar-gcc.patch
make
./unrar
How can I see the new unrar has Intel SIMD?
new unrar:
sander@zwart2204:~/unrar-6.2.6/unrar$ objdump -d ./unrar | awk '/[ \t](addps|addss|andnps|andps|cmpps|cmpss|comiss|cvtpi2ps|cvtps2pi|cvtsi2ss|cvtss2s|cvttps2pi|cvttss2si|divps|divss|ldmxcsr|maxps|maxss|minps|minss|movaps|movhlps|movhps|movlhps|movlps|movmskps|movntps|movss|movups|mulps|mulss|orps|rcpps|rcpss|rsqrtps|rsqrtss|shufps|sqrtps|sqrtss|stmxcsr|subps|subss|ucomiss|unpckhps|unpcklps|xorps|pavgb|pavgw|pextrw|pinsrw|pmaxsw|pmaxub|pminsw|pminub|pmovmskb|psadbw|pshufw)[ \t]/' | wc -l
354
looks good (right?), but the old unrar has SIMD too, and even more lines?
sander@zwart2204:~/unrar-6.2.6/unrar$ objdump -d /usr/bin/unrar | awk '/[ \t](addps|addss|andnps|andps|cmpps|cmpss|comiss|cvtpi2ps|cvtps2pi|cvtsi2ss|cvtss2s|cvttps2pi|cvttss2si|divps|divss|ldmxcsr|maxps|maxss|minps|minss|movaps|movhlps|movhps|movlhps|movlps|movmskps|movntps|movss|movups|mulps|mulss|orps|rcpps|rcpss|rsqrtps|rsqrtss|shufps|sqrtps|sqrtss|stmxcsr|subps|subss|ucomiss|unpckhps|unpcklps|xorps|pavgb|pavgw|pextrw|pinsrw|pmaxsw|pmaxub|pminsw|pminub|pmovmskb|psadbw|pshufw)[ \t]/' | wc -l
1279
You can try to grep the disassembly for the `aesenc` instruction.
Ideally, though, remove the `strip` command in the makefile, rebuild, and run `perf record ./unrar ...`, then `perf report`, and check the profile. The profile should show what's consuming CPU.
with the SIMD unrar:
sudo perf record ../unrar x postfile.part01.rar
sudo perf report
which says:
Samples: 7K of event 'cycles', Event count (approx.): 6942836427
Overhead Command Shared Object Symbol
15,71% unrar unrar [.] 0x0000000000013b66
9,72% unrar [kernel.kallsyms] [k] copy_user_enhanced_fast_string
4,17% unrar unrar [.] 0x0000000000013b45
4,12% unrar unrar [.] 0x0000000000013b85
4,10% unrar unrar [.] 0x0000000000013b8d
4,10% unrar unrar [.] 0x0000000000013b94
3,99% unrar unrar [.] 0x0000000000013b39
3,97% unrar unrar [.] 0x0000000000013b57
3,89% unrar unrar [.] 0x0000000000013ba8
3,79% unrar unrar [.] 0x0000000000013b7a
3,67% unrar unrar [.] 0x0000000000013b78
3,33% unrar unrar [.] 0x0000000000013ba3
2,94% unrar [kernel.kallsyms] [k] __filemap_add_folio
1,27% unrar [kernel.kallsyms] [k] clear_page_erms
1,22% unrar [kernel.kallsyms] [k] rmqueue_bulk
0,98% unrar [kernel.kallsyms] [k] mem_cgroup_css_rstat_flush
0,84% unrar [kernel.kallsyms] [k] consume_stock
0,79% unrar [kernel.kallsyms] [k] blkcg_rstat_flush
0,74% unrar [kernel.kallsyms] [k] xas_load
0,66% unrar [kernel.kallsyms] [k] __pagevec_lru_add_fn
0,54% unrar [kernel.kallsyms] [k] get_mem_cgroup_from_mm
0,51% unrar [kernel.kallsyms] [k] ext4_fill_raw_inode
0,46% unrar [kernel.kallsyms] [k] fault_in_readable
0,46% unrar [kernel.kallsyms] [k] try_charge_memcg
0,46% unrar [kernel.kallsyms] [k] __mem_cgroup_charge
0,46% unrar [kernel.kallsyms] [k] kmem_cache_alloc
plus
Cannot load tips.txt file, please install perf!
with plain old unrar:
Samples: 8K of event 'cycles', Event count (approx.): 7529120598
Overhead Command Shared Object Symbol
17,71% unrar unrar [.] 0x00000000000119f8
9,37% unrar [kernel.kallsyms] [k] copy_user_enhanced_fast_string
5,73% unrar unrar [.] 0x0000000000011a0e
4,05% unrar unrar [.] 0x0000000000011a3e
3,80% unrar unrar [.] 0x0000000000011a18
3,78% unrar unrar [.] 0x0000000000011a2f
3,65% unrar unrar [.] 0x00000000000119ff
3,60% unrar unrar [.] 0x0000000000011a4a
3,49% unrar unrar [.] 0x0000000000011a5d
3,27% unrar unrar [.] 0x00000000000119f5
3,24% unrar unrar [.] 0x00000000000119f2
3,10% unrar unrar [.] 0x0000000000011a48
3,04% unrar unrar [.] 0x0000000000011a25
2,30% unrar [kernel.kallsyms] [k] __filemap_add_folio
1,01% unrar [kernel.kallsyms] [k] clear_page_erms
0,88% unrar [kernel.kallsyms] [k] rmqueue_bulk
0,87% unrar [kernel.kallsyms] [k] mem_cgroup_css_rstat_flush
0,73% unrar [kernel.kallsyms] [k] xas_load
0,66% unrar [kernel.kallsyms] [k] blkcg_rstat_flush
0,62% unrar [kernel.kallsyms] [k] get_mem_cgroup_from_mm
0,61% unrar [kernel.kallsyms] [k] __pagevec_lru_add_fn
0,58% unrar [kernel.kallsyms] [k] consume_stock
0,51% unrar [kernel.kallsyms] [k] try_charge_memcg
0,46% unrar [kernel.kallsyms] [k] _raw_spin_lock
0,45% unrar [kernel.kallsyms] [k] fault_in_readable
0,42% unrar [kernel.kallsyms] [k] filemap_get_read_batch
0,42% unrar [kernel.kallsyms] [k] __mem_cgroup_charge
0,42% unrar [kernel.kallsyms] [k] ext4_fill_raw_inode
SIMD-unrar:
$ time ../unrar x postfile.part01.rar
real 0m2,410s
user 0m0,980s
sys 0m0,717s
old unrar:
$ time /usr/bin/unrar x postfile.part01.rar
real 0m2,651s
user 0m1,082s
sys 0m0,759s
So the SIMD-unrar needs about 10% less processing time?
You'll want to remove the `$(STRIP) unrar` line in the makefile and rebuild before doing the perf profile; otherwise the function names get stripped (you'll want to do this for both versions of unrar).
I've made some fixes to the patch though (I'm messing this up a bit :/), so try doing a build again with the new patch.
Thanks for testing so far!
I get: