@Adenilson
Created March 1, 2024 03:46
First draft for adding SIMD optimizations to the CentOS zlib package, starting with CRC-32.
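The patch below wires Chromium's SSE4.2+PCLMUL CRC-32 kernel into the zlib build. Before the diff, here is a minimal standalone sanity check (not part of the patch; it assumes the rebuilt libz is installed and linked with -lz, and the buffer contents and the 7-byte stride are arbitrary choices). It confirms that a one-shot call, long enough to take the SIMD path (>= 64 bytes), agrees with short incremental calls that stay on the generic table-driven path, and that the standard CRC-32 check value still comes out right:

/* crc32_check.c -- illustrative sanity check for the patched zlib.
 * Build: cc crc32_check.c -o crc32_check -lz
 */
#include <stdio.h>
#include <zlib.h>

int main(void)
{
    unsigned char buf[1024];
    uLong check, one_shot, incremental;
    size_t i;

    /* Standard CRC-32 check value: crc32("123456789") == 0xcbf43926. */
    check = crc32(0L, (const Bytef *)"123456789", 9);
    printf("check value: %08lx (expected cbf43926)\n", check);

    for (i = 0; i < sizeof(buf); i++)
        buf[i] = (unsigned char)(i * 31 + 7);

    /* One shot: len >= 64, so the SSE4.2+PCLMUL routine covers most of it. */
    one_shot = crc32(0L, buf, (uInt)sizeof(buf));

    /* Incremental, 7 bytes at a time: every call is below the 64-byte SIMD
     * minimum, so this stays on the generic table-driven code. */
    incremental = 0L;
    for (i = 0; i < sizeof(buf); i += 7) {
        uInt n = (uInt)(sizeof(buf) - i < 7 ? sizeof(buf) - i : 7);
        incremental = crc32(incremental, buf + i, n);
    }

    printf("one-shot:    %08lx\nincremental: %08lx\n", one_shot, incremental);
    return (one_shot == incremental && check == 0xcbf43926UL) ? 0 : 1;
}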
diff --git a/zlib-1.2.11-x86-64-SIMD-crc32.patch b/zlib-1.2.11-x86-64-SIMD-crc32.patch
new file mode 100644
index 0000000..f2d7935
--- /dev/null
+++ b/zlib-1.2.11-x86-64-SIMD-crc32.patch
@@ -0,0 +1,827 @@
+From 0a1a7bf6c7ba29f7096447fd5bb4e36407e760d8 Mon Sep 17 00:00:00 2001
+From: Adenilson Cavalcanti <cavalcantii@chromium.org>
+Date: Thu, 29 Feb 2024 17:52:35 -0800
+Subject: [PATCH] Add a SIMD-optimized CRC-32; should help considerably for
+ decompression (and a bit for compression too).
+
+---
+ Makefile.in | 8 +
+ configure | 24 ++
+ contrib/chromium/crc32_simd.c | 628 ++++++++++++++++++++++++++++++++++
+ contrib/chromium/crc32_simd.h | 57 +++
+ crc32.c | 18 +
+ 5 files changed, 735 insertions(+)
+ create mode 100644 contrib/chromium/crc32_simd.c
+ create mode 100644 contrib/chromium/crc32_simd.h
+
+diff --git a/Makefile.in b/Makefile.in
+index 63f76da..e6f3c3c 100644
+--- a/Makefile.in
++++ b/Makefile.in
+@@ -183,6 +183,9 @@ crc32_z_power8.o: $(SRCDIR)contrib/power/crc32_z_power8.c
+ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-vx.c
+ $(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32-vx.c
+
++crc32_simd.o: $(SRCDIR)contrib/chromium/crc32_simd.c
++ $(CC) $(CFLAGS) $(ZINC) -msse4.2 -mpclmul -c -o $@ $(SRCDIR)contrib/chromium/crc32_simd.c
++
+ deflate.o: $(SRCDIR)deflate.c
+ $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
+
+@@ -243,6 +246,11 @@ crc32_z_power8.lo: $(SRCDIR)contrib/power/crc32_z_power8.c
+ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/crc32_z_power8.o $(SRCDIR)contrib/power/crc32_z_power8.c
+ -@mv objs/crc32_z_power8.o $@
+
++crc32_simd.lo: $(SRCDIR)contrib/chromium/crc32_simd.c
++ -@mkdir objs 2>/dev/null || test -d objs
++ $(CC) $(SFLAGS) -msse4.2 -mpclmul $(ZINC) -DPIC -c -o objs/crc32_simd.o $(SRCDIR)contrib/chromium/crc32_simd.c
++ -@mv objs/crc32_simd.o $@
++
+ deflate.lo: $(SRCDIR)deflate.c
+ -@mkdir objs 2>/dev/null || test -d objs
+ $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
+diff --git a/configure b/configure
+index a4606b8..48953ee 100755
+--- a/configure
++++ b/configure
+@@ -115,6 +115,7 @@ case "$1" in
+ echo ' [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
+ echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
+ echo ' [--dfltcc]' | tee -a configure.log
++ echo ' [--simd]' | tee -a configure.log
+ exit 0 ;;
+ -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
+ -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
+@@ -144,6 +145,10 @@ case "$1" in
+ PIC_OBJC="$PIC_OBJC dfltcc.lo"
+ shift
+ ;;
++ --simd)
++ CFLAGS="$CFLAGS -DSIMD"
++ shift
++ ;;
+ *)
+ echo "unknown option: $1" | tee -a configure.log
+ echo "$0 --help for help" | tee -a configure.log
+@@ -941,6 +946,25 @@ else
+ echo "Checking for s390 vector extensions... No." | tee -a configure.log
+ fi
+
++echo >> configure.log
++cat > $test.c <<EOF
++#if defined(__x86_64__)
++ // Enable SIMD by default on x86.
++#else
++ #error "Target doesn't support SIMD optimizations."
++#endif
++EOF
++
++if tryboth $CC -c $CFLAGS $test.c; then
++ CFLAGS="${CFLAGS} -DENABLE_SIMD -DCRC32_SIMD_SSE42_PCLMUL"
++ OBJC="$OBJC crc32_simd.o"
++ PIC_OBJC="$PIC_OBJC crc32_simd.lo"
++ echo "Enabling SIMD support... Yes." | tee -a configure.log
++else
++ echo "Enabling SIMD ... No." | tee -a configure.log
++fi
++
++
+ # show the results in the log
+ echo >> configure.log
+ echo ALL = $ALL >> configure.log
+diff --git a/contrib/chromium/crc32_simd.c b/contrib/chromium/crc32_simd.c
+new file mode 100644
+index 0000000..5670470
+--- /dev/null
++++ b/contrib/chromium/crc32_simd.c
+@@ -0,0 +1,628 @@
++/* crc32_simd.c
++ *
++ * Copyright 2017 The Chromium Authors
++ * Use of this source code is governed by a BSD-style license that can be
++ * found in the Chromium source repository LICENSE file.
++ */
++
++#include "crc32_simd.h"
++// FIXME(cavalcantii): this is available on newer versions of zutil.h.
++#ifdef _MSC_VER
++#define zalign(x) __declspec(align(x))
++#else
++#define zalign(x) __attribute__((aligned((x))))
++#endif
++
++#if defined(CRC32_SIMD_AVX512_PCLMUL)
++
++/*
++ * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
++ * length must be at least 256, and a multiple of 64. Based on:
++ *
++ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
++ * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
++ */
++
++#include <emmintrin.h>
++#include <smmintrin.h>
++#include <wmmintrin.h>
++#include <immintrin.h>
++
++uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */
++ const unsigned char *buf,
++ z_size_t len,
++ uint32_t crc)
++{
++ /*
++ * Definitions of the bit-reflected domain constants k1,k2,k3,k4
++ * are similar to those given at the end of the paper, and remaining
++ * constants and CRC32+Barrett polynomials remain unchanged.
++ *
++ * Replace the index of x from 128 to 512. As follows:
++ * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
++ * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
++ * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
++ * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
++ */
++ static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
++ 0x011542778a, 0x01322d1430,
++ 0x011542778a, 0x01322d1430,
++ 0x011542778a, 0x01322d1430 };
++ static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
++ 0x0154442bd4, 0x01c6e41596,
++ 0x0154442bd4, 0x01c6e41596,
++ 0x0154442bd4, 0x01c6e41596 };
++ static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
++ static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
++ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
++ __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
++ __m128i a0, a1, a2, a3;
++
++ /*
++ * There's at least one block of 256.
++ */
++ x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
++ x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
++ x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
++ x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
++
++ x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
++
++ x0 = _mm512_load_si512((__m512i *)k1k2);
++
++ buf += 256;
++ len -= 256;
++
++ /*
++ * Parallel fold blocks of 256, if any.
++ */
++ while (len >= 256)
++ {
++ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
++ x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
++ x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
++ x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
++
++
++ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
++ x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
++ x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
++ x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
++
++ y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
++ y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
++ y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
++ y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
++
++ x1 = _mm512_xor_si512(x1, x5);
++ x2 = _mm512_xor_si512(x2, x6);
++ x3 = _mm512_xor_si512(x3, x7);
++ x4 = _mm512_xor_si512(x4, x8);
++
++ x1 = _mm512_xor_si512(x1, y5);
++ x2 = _mm512_xor_si512(x2, y6);
++ x3 = _mm512_xor_si512(x3, y7);
++ x4 = _mm512_xor_si512(x4, y8);
++
++ buf += 256;
++ len -= 256;
++ }
++
++ /*
++ * Fold into 512-bits.
++ */
++ x0 = _mm512_load_si512((__m512i *)k3k4);
++
++ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
++ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
++ x1 = _mm512_xor_si512(x1, x2);
++ x1 = _mm512_xor_si512(x1, x5);
++
++ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
++ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
++ x1 = _mm512_xor_si512(x1, x3);
++ x1 = _mm512_xor_si512(x1, x5);
++
++ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
++ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
++ x1 = _mm512_xor_si512(x1, x4);
++ x1 = _mm512_xor_si512(x1, x5);
++
++ /*
++ * Single fold blocks of 64, if any.
++ */
++ while (len >= 64)
++ {
++ x2 = _mm512_loadu_si512((__m512i *)buf);
++
++ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
++ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
++ x1 = _mm512_xor_si512(x1, x2);
++ x1 = _mm512_xor_si512(x1, x5);
++
++ buf += 64;
++ len -= 64;
++ }
++
++ /*
++ * Fold 512-bits to 384-bits.
++ */
++ a0 = _mm_load_si128((__m128i *)k5k6);
++
++ a1 = _mm512_extracti32x4_epi32(x1, 0);
++ a2 = _mm512_extracti32x4_epi32(x1, 1);
++
++ a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
++ a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
++
++ a1 = _mm_xor_si128(a1, a3);
++ a1 = _mm_xor_si128(a1, a2);
++
++ /*
++ * Fold 384-bits to 256-bits.
++ */
++ a2 = _mm512_extracti32x4_epi32(x1, 2);
++ a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
++ a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
++ a1 = _mm_xor_si128(a1, a3);
++ a1 = _mm_xor_si128(a1, a2);
++
++ /*
++ * Fold 256-bits to 128-bits.
++ */
++ a2 = _mm512_extracti32x4_epi32(x1, 3);
++ a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
++ a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
++ a1 = _mm_xor_si128(a1, a3);
++ a1 = _mm_xor_si128(a1, a2);
++
++ /*
++ * Fold 128-bits to 64-bits.
++ */
++ a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
++ a3 = _mm_setr_epi32(~0, 0, ~0, 0);
++ a1 = _mm_srli_si128(a1, 8);
++ a1 = _mm_xor_si128(a1, a2);
++
++ a0 = _mm_loadl_epi64((__m128i*)k7k8);
++ a2 = _mm_srli_si128(a1, 4);
++ a1 = _mm_and_si128(a1, a3);
++ a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
++ a1 = _mm_xor_si128(a1, a2);
++
++ /*
++ * Barrett reduce to 32-bits.
++ */
++ a0 = _mm_load_si128((__m128i*)poly);
++
++ a2 = _mm_and_si128(a1, a3);
++ a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
++ a2 = _mm_and_si128(a2, a3);
++ a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
++ a1 = _mm_xor_si128(a1, a2);
++
++ /*
++ * Return the crc32.
++ */
++ return _mm_extract_epi32(a1, 1);
++}
++
++#elif defined(CRC32_SIMD_SSE42_PCLMUL)
++
++/*
++ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
++ * length must be at least 64, and a multiple of 16.
++ */
++
++#include <emmintrin.h>
++#include <smmintrin.h>
++#include <wmmintrin.h>
++
++uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
++ const unsigned char *buf,
++ z_size_t len,
++ uint32_t crc)
++{
++ /*
++ * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
++ * the CRC32+Barrett polynomials given at the end of the paper.
++ */
++ static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
++ static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
++ static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
++ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
++
++ __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
++
++ /*
++ * There's at least one block of 64.
++ */
++ x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
++ x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
++ x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
++ x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
++
++ x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
++
++ x0 = _mm_load_si128((__m128i *)k1k2);
++
++ buf += 64;
++ len -= 64;
++
++ /*
++ * Parallel fold blocks of 64, if any.
++ */
++ while (len >= 64)
++ {
++ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
++ x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
++ x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
++ x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
++
++ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
++ x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
++ x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
++ x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
++
++ y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
++ y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
++ y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
++ y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
++
++ x1 = _mm_xor_si128(x1, x5);
++ x2 = _mm_xor_si128(x2, x6);
++ x3 = _mm_xor_si128(x3, x7);
++ x4 = _mm_xor_si128(x4, x8);
++
++ x1 = _mm_xor_si128(x1, y5);
++ x2 = _mm_xor_si128(x2, y6);
++ x3 = _mm_xor_si128(x3, y7);
++ x4 = _mm_xor_si128(x4, y8);
++
++ buf += 64;
++ len -= 64;
++ }
++
++ /*
++ * Fold into 128-bits.
++ */
++ x0 = _mm_load_si128((__m128i *)k3k4);
++
++ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
++ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
++ x1 = _mm_xor_si128(x1, x2);
++ x1 = _mm_xor_si128(x1, x5);
++
++ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
++ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
++ x1 = _mm_xor_si128(x1, x3);
++ x1 = _mm_xor_si128(x1, x5);
++
++ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
++ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
++ x1 = _mm_xor_si128(x1, x4);
++ x1 = _mm_xor_si128(x1, x5);
++
++ /*
++ * Single fold blocks of 16, if any.
++ */
++ while (len >= 16)
++ {
++ x2 = _mm_loadu_si128((__m128i *)buf);
++
++ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
++ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
++ x1 = _mm_xor_si128(x1, x2);
++ x1 = _mm_xor_si128(x1, x5);
++
++ buf += 16;
++ len -= 16;
++ }
++
++ /*
++ * Fold 128-bits to 64-bits.
++ */
++ x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
++ x3 = _mm_setr_epi32(~0, 0, ~0, 0);
++ x1 = _mm_srli_si128(x1, 8);
++ x1 = _mm_xor_si128(x1, x2);
++
++ x0 = _mm_loadl_epi64((__m128i*)k5k0);
++
++ x2 = _mm_srli_si128(x1, 4);
++ x1 = _mm_and_si128(x1, x3);
++ x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
++ x1 = _mm_xor_si128(x1, x2);
++
++ /*
++ * Barrett reduce to 32-bits.
++ */
++ x0 = _mm_load_si128((__m128i*)poly);
++
++ x2 = _mm_and_si128(x1, x3);
++ x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
++ x2 = _mm_and_si128(x2, x3);
++ x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
++ x1 = _mm_xor_si128(x1, x2);
++
++ /*
++ * Return the crc32.
++ */
++ return _mm_extract_epi32(x1, 1);
++}
++
++#elif defined(CRC32_ARMV8_CRC32)
++
++/* CRC32 checksums using ARMv8-a crypto instructions.
++ */
++
++#if defined(__clang__)
++/* We need some extra types for using PMULL.
++ */
++#if defined(__aarch64__)
++#include <arm_neon.h>
++#include <arm_acle.h>
++#endif
++
++/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
++ * armv8 target, which is incompatible with ThinLTO optimizations on Android.
++ * (Namely, mixing and matching different module-level targets makes ThinLTO
++ * warn, and Android defaults to armv7-a. This restriction does not apply to
++ * function-level `target`s, however.)
++ *
++ * Since we only need four crc intrinsics, and since clang's implementation of
++ * those are just wrappers around compiler builtins, it's simplest to #define
++ * those builtins directly. If this #define list grows too much (or we depend on
++ * an intrinsic that isn't a trivial wrapper), we may have to find a better way
++ * to go about this.
++ *
++ * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
++ * feature for this target (ignoring feature)." This appears to be a harmless
++ * bug in clang.
++ *
++ * These definitions must appear *after* including arm_acle.h otherwise that
++ * header may end up defining functions named __builtin_arm_crc32* that call
++ * themselves, creating an infinite loop when the intrinsic is called.
++ */
++/* XXX: Cannot hook into builtins with XCode for arm64. */
++#if !defined(ARMV8_OS_MACOS)
++#define __crc32b __builtin_arm_crc32b
++#define __crc32d __builtin_arm_crc32d
++#define __crc32w __builtin_arm_crc32w
++#define __crc32cw __builtin_arm_crc32cw
++#endif
++
++#if defined(__aarch64__)
++#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
++#else // !defined(__aarch64__)
++#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
++#endif // defined(__aarch64__)
++
++#elif defined(__GNUC__)
++/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
++ * allowed. We can just include arm_acle.h.
++ */
++#include <arm_acle.h>
++#include <arm_neon.h>
++#define TARGET_ARMV8_WITH_CRC
++#else // !defined(__GNUC__) && !defined(_aarch64__)
++#error ARM CRC32 SIMD extensions only supported for Clang and GCC
++#endif
++
++TARGET_ARMV8_WITH_CRC
++uint32_t ZLIB_INTERNAL armv8_crc32_little(
++ const unsigned char *buf,
++ z_size_t len,
++ uint32_t crc)
++{
++ uint32_t c = (uint32_t) ~crc;
++
++ while (len && ((uintptr_t)buf & 7)) {
++ c = __crc32b(c, *buf++);
++ --len;
++ }
++
++ const uint64_t *buf8 = (const uint64_t *)buf;
++
++ while (len >= 64) {
++ c = __crc32d(c, *buf8++);
++ c = __crc32d(c, *buf8++);
++ c = __crc32d(c, *buf8++);
++ c = __crc32d(c, *buf8++);
++
++ c = __crc32d(c, *buf8++);
++ c = __crc32d(c, *buf8++);
++ c = __crc32d(c, *buf8++);
++ c = __crc32d(c, *buf8++);
++ len -= 64;
++ }
++
++ while (len >= 8) {
++ c = __crc32d(c, *buf8++);
++ len -= 8;
++ }
++
++ buf = (const unsigned char *)buf8;
++
++ while (len--) {
++ c = __crc32b(c, *buf++);
++ }
++
++ return ~c;
++}
++
++#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
++
++/*
++ * crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
++ * length must be at least 64, and a multiple of 16. Based on:
++ *
++ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
++ * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
++ */
++TARGET_ARMV8_WITH_CRC
++static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
++{
++ uint8x16_t r;
++ __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
++ : "=w" (r) : "w" (a), "w" (b) );
++ return r;
++}
++
++TARGET_ARMV8_WITH_CRC
++static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
++{
++ uint8x16_t r;
++ __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
++ : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
++ return r;
++}
++
++TARGET_ARMV8_WITH_CRC
++static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
++{
++ uint8x16_t r;
++ __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
++ : "=w" (r) : "w" (a), "w" (b) );
++ return r;
++}
++
++TARGET_ARMV8_WITH_CRC
++uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
++ const unsigned char *buf,
++ z_size_t len,
++ uint32_t crc)
++{
++ /*
++ * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
++ * the CRC32+Barrett polynomials given at the end of the paper.
++ */
++ static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
++ static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
++ static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
++ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
++
++ uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
++
++ /*
++ * There's at least one block of 64.
++ */
++ x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
++ x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
++ x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
++ x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
++
++ x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
++
++ x0 = vld1q_u64(k1k2);
++
++ buf += 64;
++ len -= 64;
++
++ /*
++ * Parallel fold blocks of 64, if any.
++ */
++ while (len >= 64)
++ {
++ x5 = (uint64x2_t) pmull_lo(x1, x0);
++ x6 = (uint64x2_t) pmull_lo(x2, x0);
++ x7 = (uint64x2_t) pmull_lo(x3, x0);
++ x8 = (uint64x2_t) pmull_lo(x4, x0);
++
++ y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
++ y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
++ y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
++ y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
++
++ x1 = (uint64x2_t) pmull_hi(x1, x0);
++ x2 = (uint64x2_t) pmull_hi(x2, x0);
++ x3 = (uint64x2_t) pmull_hi(x3, x0);
++ x4 = (uint64x2_t) pmull_hi(x4, x0);
++
++ x1 = veorq_u64(x1, x5);
++ x2 = veorq_u64(x2, x6);
++ x3 = veorq_u64(x3, x7);
++ x4 = veorq_u64(x4, x8);
++
++ x1 = veorq_u64(x1, y5);
++ x2 = veorq_u64(x2, y6);
++ x3 = veorq_u64(x3, y7);
++ x4 = veorq_u64(x4, y8);
++
++ buf += 64;
++ len -= 64;
++ }
++
++ /*
++ * Fold into 128-bits.
++ */
++ x0 = vld1q_u64(k3k4);
++
++ x5 = (uint64x2_t) pmull_lo(x1, x0);
++ x1 = (uint64x2_t) pmull_hi(x1, x0);
++ x1 = veorq_u64(x1, x2);
++ x1 = veorq_u64(x1, x5);
++
++ x5 = (uint64x2_t) pmull_lo(x1, x0);
++ x1 = (uint64x2_t) pmull_hi(x1, x0);
++ x1 = veorq_u64(x1, x3);
++ x1 = veorq_u64(x1, x5);
++
++ x5 = (uint64x2_t) pmull_lo(x1, x0);
++ x1 = (uint64x2_t) pmull_hi(x1, x0);
++ x1 = veorq_u64(x1, x4);
++ x1 = veorq_u64(x1, x5);
++
++ /*
++ * Single fold blocks of 16, if any.
++ */
++ while (len >= 16)
++ {
++ x2 = vld1q_u64((const uint64_t *)buf);
++
++ x5 = (uint64x2_t) pmull_lo(x1, x0);
++ x1 = (uint64x2_t) pmull_hi(x1, x0);
++ x1 = veorq_u64(x1, x2);
++ x1 = veorq_u64(x1, x5);
++
++ buf += 16;
++ len -= 16;
++ }
++
++ /*
++ * Fold 128-bits to 64-bits.
++ */
++ static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
++
++ x2 = (uint64x2_t) pmull_01(x1, x0);
++ x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
++ x3 = (uint64x2_t) vld1q_u32(mask);
++ x1 = veorq_u64(x1, x2);
++
++ x0 = vld1q_u64(k5k0);
++
++ x2 = (uint64x2_t) pmull_01(x2, x0);
++ x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
++ x1 = vandq_u64(x1, x3);
++ x1 = (uint64x2_t) pmull_lo(x1, x0);
++ x1 = veorq_u64(x1, x2);
++
++ /*
++ * Barrett reduce to 32-bits.
++ */
++ x0 = vld1q_u64(poly);
++
++ x2 = vandq_u64(x1, x3);
++ x2 = (uint64x2_t) pmull_01(x2, x0);
++ x2 = vandq_u64(x2, x3);
++ x2 = (uint64x2_t) pmull_lo(x2, x0);
++ x1 = veorq_u64(x1, x2);
++
++ /*
++ * Return the crc32.
++ */
++ return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
++}
++#endif /* aarch64 specific code. */
++
++#endif
+diff --git a/contrib/chromium/crc32_simd.h b/contrib/chromium/crc32_simd.h
+new file mode 100644
+index 0000000..fbd3157
+--- /dev/null
++++ b/contrib/chromium/crc32_simd.h
+@@ -0,0 +1,57 @@
++/* crc32_simd.h
++ *
++ * Copyright 2017 The Chromium Authors
++ * Use of this source code is governed by a BSD-style license that can be
++ * found in the Chromium source repository LICENSE file.
++ */
++
++#include <stdint.h>
++
++#include "../../zconf.h"
++#include "../../zutil.h"
++#include "../../deflate.h"
++
++/*
++ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
++ * length must be at least 64, and a multiple of 16.
++ */
++uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
++ z_size_t len,
++ uint32_t crc);
++
++uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
++ z_size_t len,
++ uint32_t crc);
++
++/*
++ * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
++ * for computing the crc32 of an arbitrary length buffer.
++ */
++#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
++#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
++#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
++#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63
++
++/*
++ * CRC32 checksums using ARMv8-a crypto instructions.
++ */
++uint32_t ZLIB_INTERNAL armv8_crc32_little(const unsigned char* buf,
++ z_size_t len,
++ uint32_t crc);
++
++/* aarch64 specific code. */
++#if defined(__aarch64__)
++
++/* 128 is the sweet spot at the time of coding (late 2020). */
++#define Z_CRC32_PMULL_MINIMUM_LENGTH 128
++#define Z_CRC32_PMULL_CHUNKSIZE_MASK 15
++
++/*
++ * CRC32 checksums using ARMv8-a PMULL instructions, where the buffer
++ * length must be at least 64, and a multiple of 16.
++ */
++uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(const unsigned char* buf,
++ z_size_t len,
++ uint32_t crc);
++
++#endif
+diff --git a/crc32.c b/crc32.c
+index 379fac3..5d42a53 100644
+--- a/crc32.c
++++ b/crc32.c
+@@ -30,6 +30,10 @@
+
+ #include "zutil.h" /* for STDC and FAR definitions */
+
++#if defined(ENABLE_SIMD)
++#include "contrib/chromium/crc32_simd.h"
++#endif
++
+ /* Definitions for doing the crc four data bytes at a time. */
+ #if !defined(NOBYFOUR) && defined(Z_U4)
+ # define BYFOUR
+@@ -213,6 +217,20 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
+ {
+ if (buf == Z_NULL) return 0UL;
+
++#if defined(ENABLE_SIMD)
++ if (len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
++ /* crc32 16-byte chunks */
++ z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
++ crc = ~crc32_sse42_simd_(buf, chunk_size, ~(uint32_t)crc);
++ /* check remaining data */
++ len -= chunk_size;
++ if (!len)
++ return crc;
++ /* Fall into the default crc32 for the remaining data. */
++ buf += chunk_size;
++ }
++#endif
++
+ #ifdef DYNAMIC_CRC_TABLE
+ if (crc_table_empty)
+ make_crc_table();
+--
+2.39.3
+
diff --git a/zlib.spec b/zlib.spec
index 66c23fe..f0ba6a5 100644
--- a/zlib.spec
+++ b/zlib.spec
@@ -8,7 +8,7 @@ Summary: Compression and decompression library
License: zlib and Boost
URL: https://www.zlib.net/
-Source: https://www.zlib.net/zlib-%{version}.tar.xz
+Source: https://www.zlib.net/fossils/zlib-%{version}.tar.xz
# https://github.com/madler/zlib/pull/210
Patch0: zlib-1.2.5-minizip-fixuncrypt.patch
# resolves: #805113
@@ -67,6 +67,9 @@ Patch29: zlib-1.2.11-IBM-Z-hw-accelrated-deflate-fix-crash-deflateBound.patch
# Upstream patch: https://github.com/madler/zlib/commit/73331a6a0481067628f065ffe87bb1d8f787d10c
Patch30: zlib-1.2.13-Reject-overflows-of-zip-header-fields-in-minizip.patch
+# Add a SIMD implementation for CRC-32 (enabled only on x86-64 for now)
+Patch31: zlib-1.2.11-x86-64-SIMD-crc32.patch
+
BuildRequires: make
BuildRequires: automake, autoconf, libtool
@@ -145,6 +148,7 @@ developing applications which use minizip.
%patch28 -p1
%patch29 -p1
%patch30 -p1
+%patch31 -p1
iconv -f iso-8859-2 -t utf-8 < ChangeLog > ChangeLog.tmp
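
For reviewers, a standalone sketch (not part of the patch) of the length-splitting arithmetic in the crc32_z() hunk above: the SSE4.2 routine only consumes whole 16-byte chunks, so the length is masked with ~Z_CRC32_SSE42_CHUNKSIZE_MASK and whatever remains falls through to the existing table-driven loop.

/* split_demo.c -- how the crc32_z() hunk splits a buffer between the SIMD
 * kernel and the scalar tail. Standalone illustration; the macro values
 * mirror contrib/chromium/crc32_simd.h. */
#include <stdio.h>
#include <stddef.h>

#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15

int main(void)
{
    size_t len = 1000;  /* arbitrary example length */

    if (len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
        size_t chunk = len & ~(size_t)Z_CRC32_SSE42_CHUNKSIZE_MASK;
        size_t tail  = len - chunk;
        /* 1000 -> 992 bytes for crc32_sse42_simd_(), 8 bytes left for the
         * generic table-driven code. */
        printf("SIMD chunk: %zu bytes, scalar tail: %zu bytes\n", chunk, tail);
    }
    return 0;
}

Handling the tail in the generic code keeps the SIMD kernel free of any sub-16-byte handling, which is why it can require a minimum length of 64 and a chunk size that is a multiple of 16.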