Skip to content

Instantly share code, notes, and snippets.

@mmozeiko
Last active February 25, 2024 08:08
Show Gist options
  • Star 14 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mmozeiko/e66f6d23e101b1b9c37cb3d9d10727f5 to your computer and use it in GitHub Desktop.
Save mmozeiko/e66f6d23e101b1b9c37cb3d9d10727f5 to your computer and use it in GitHub Desktop.
uncompressed png writer & reader
#pragma once
// uncompressed png writer & reader
// supports only 8-bit and 16-bit formats
// Performance comparison for 8192x8192 BGRA8 image (256MB)
// Compiled with "clang -O2", AVX2 requires extra "-mavx2" or "/arch:AVX2" argument
//
// For libpng (compressed) uses default libpng/zlib compression settings
// For libpng (uncompressed) case following two functions are used:
// png_set_compression_level() with Z_NO_COMPRESSION
// png_set_filter() with PNG_FILTER_NONE
//
// Ryzen 5950x
// upng (AVX2) = 22.9 msec (11157.3 MB/s), read = 20.5 msec (12499.4 MB/s)
// upng = 27.7 msec ( 9254.6 MB/s), read = 20.8 msec (12296.8 MB/s)
// libpng (uncompressed) = 169.9 msec ( 1506.9 MB/s), read = 167.5 msec ( 1528.6 MB/s)
// libpng (compressed) = 2148.1 msec ( 119.2 MB/s), read = 503.5 msec ( 508.4 MB/s)
//
// Raspberry Pi4 (-march=armv8+crc)
// upng = 182.9 msec (1399.7 MB/s), read = 110.8 msec (2310.8 MB/s)
// libpng (uncompressed) = 1192.7 msec ( 214.6 MB/s), read = 1211.8 msec ( 211.3 MB/s)
// libpng (compressed) = 9396.8 msec ( 27.2 MB/s), read = 1874.6 msec ( 136.6 MB/s)
//
// Apple M1 (-march=armv8+crc+crypto)
// upng = 22.2 msec (11523.7 MB/s), read = 8.9 msec (28622.5 MB/s)
// libpng (uncompressed) = 93.3 msec ( 2743.3 MB/s), read = 66.6 msec ( 3841.8 MB/s)
// libpng (compressed) = 2038.6 msec ( 125.6 MB/s), read = 90.4 msec ( 2832.5 MB/s)
#include <stddef.h>
#include <stdint.h>
// Pixel formats supported by the writer/reader. The numeric suffix is bits per
// channel (only 8 and 16, per the header comment above); the letters give the
// in-memory channel order: G = gray, A = alpha, RGB/BGR = color component order.
typedef enum {
UPNG_FORMAT_G8,     // gray
UPNG_FORMAT_GA8,    // gray + alpha
UPNG_FORMAT_RGB8,   // red, green, blue
UPNG_FORMAT_BGR8,   // blue, green, red
UPNG_FORMAT_RGBA8,  // red, green, blue, alpha
UPNG_FORMAT_BGRA8,  // blue, green, red, alpha
UPNG_FORMAT_G16,    // 16-bit variants of the above
UPNG_FORMAT_GA16,
UPNG_FORMAT_RGB16,
UPNG_FORMAT_BGR16,
UPNG_FORMAT_RGBA16,
UPNG_FORMAT_BGRA16,
} upng_format;
// Scanline filter the writer applies. Values match the PNG filter-type byte
// (0 = None, 2 = Up) as defined by the PNG specification.
typedef enum {
UPNG_FILTER_NONE = 0,
UPNG_FILTER_UP = 2,
} upng_filter;
// if `dst` is NULL then function will quickly return size needed for `dst` (`src` won't be used)
// if `pitch` is 0, then pixels in `src` are tightly packed without any padding bytes between rows
// returns 0 for unsupported parameter values
static size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter);
// output BGR/BGRA format instead of RGB/RGBA
#define UPNG_READ_SWAP_TO_BGR 1
// if `dst` is NULL then function will quickly return `width` / `height` / `format` values from png header
// if `pitch` is 0, then pixels in `dst` will be tightly packed without any padding bytes between rows
// returns total size of image - `pitch` multiplied by `height`
// returns 0 if png file cannot be successfully parsed or is unsupported
// function does NOT verify CRC32 or ADLER32 checksums
static size_t upng_read(void* dst, const void* src, size_t size, uint32_t* width, uint32_t* height, upng_format* format, size_t pitch, uint32_t flags);
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// optional defines:
// UPNG_DISABLE_AVX2 - do not use AVX2 codepath, even if AVX2 is allowed by compiler
// UPNG_USE_ARM64_PMUL - prefer to use ARM64 PMUL instruction instead of CRC32 on non-Apple targets
// this may be slower than using CRC32 instruction, depends on CPU
#if defined(_M_AMD64) || defined(__x86_64__)
# define UPNG__ARCH_X64
#elif defined(_M_ARM64) || defined(__aarch64__)
# define UPNG__ARCH_ARM64
#endif
#if defined(UPNG__ARCH_X64)
# if defined(__clang__) || defined(__GNUC__)
# include <cpuid.h>
# define UPNG__CPUID(num, regs) __cpuid(num, regs[0], regs[1], regs[2], regs[3])
# define UPNG__CPUID2(num, sub, regs) __cpuid_count(num, sub, regs[0], regs[1], regs[2], regs[3])
# define UPNG__TARGET(str) __attribute__((target(str)))
# else
# include <intrin.h>
# define UPNG__CPUID(num, regs) __cpuid(regs, num)
# define UPNG__CPUID2(num, sub, regs) __cpuidex(regs, num, sub)
# define UPNG__TARGET(str)
# endif
# if defined(__AVX2__) && !defined(UPNG_DISABLE_AVX2)
# define UPNG__ARCH_X64_AVX2
# include <immintrin.h>
# if !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER > 1930 && _MSC_VER < 1936)
// broken MSVC versions that do not generate VEX encoded VPCLMULQDQ instruction
// see https://developercommunity.visualstudio.com/t/_mm_clmulepi64_si128-intrinsic-no-longer/10277103
# pragma message("WARNING: this MSVC compiler version produces very bad performance with AVX2 codegen!")
# undef UPNG__ARCH_X64_AVX2
# elif !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER == 1938)
// broken MSVC version that generate AVX512 instructions in AVX2 code
// see https://developercommunity.visualstudio.com/t/Invalid-AVX512-instructions-generated-wh/10521872
# pragma message("WARNING: this MSVC compiler version produces invalid instructions with AVX2 codegen!")
# undef UPNG__ARCH_X64_AVX2
# endif
# endif
# include <wmmintrin.h> // CLMUL // _mm_clmulepi64_si128
# include <smmintrin.h> // SSE4.1 // _mm_extract_epi32
# include <tmmintrin.h> // SSSE3 // _mm_maddubs_epi16, _mm_hadd_epi32, _mm_shuffle_epi8
# include <emmintrin.h> // SSE2
#elif defined(UPNG__ARCH_ARM64)
# include <arm_neon.h>
# if __ARM_FEATURE_CRC32 // use -march=armv8-a+crc when possible
# define UPNG__ARM64_CRC32 // __crc32d, __crc32b
# include <arm_acle.h>
# endif
# if __ARM_FEATURE_CRYPTO // use -march=armv8-a+crypto when possible
# if defined(__APPLE__) || defined(UPNG_USE_ARM64_PMUL) || !defined(UPNG__ARM64_CRC32)
# define UPNG__ARM64_CRYPTO // vmull_p64, vmull_high_p64
# endif
# endif
#endif
#if defined(_MSC_VER) && !defined(__clang__)
# include <intrin.h>
# define UPNG__ALIGN(n, var) __declspec(align(n)) var
# define UPNG__MSVC_BARRIER() _ReadWriteBarrier()
# define UPNG__ASSUME_ALIGNED(ptr, align) (ptr)
#else
# define UPNG__ALIGN(n, var) var __attribute__((aligned(n)))
# define UPNG__MSVC_BARRIER() // not needed for non-MSVC compilers
# define UPNG__ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned(ptr, align)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define UPNG__ADLER32_INIT 1U
#define UPNG__ADLER32_MOD 65521
// max amount of bytes to not overflow "b" as uint32_t
// max "b" value = 255*n*(n+1)/2 + (n+1)*(65521-1)
#define UPNG__ADLER32_CHUNK_SIZE 5552
// max amount of 16-byte blocks to use for SIMD
#define UPNG__ADLER32_BLOCKS1 (UPNG__ADLER32_CHUNK_SIZE / 16)
#define UPNG__ADLER32_BLOCKS3 (UPNG__ADLER32_CHUNK_SIZE / 48)
#define UPNG__ADLER32_BLOCKS4 (UPNG__ADLER32_CHUNK_SIZE / 64)
// Scalar Adler-32 (RFC 1950): s1 = 1 + sum of bytes, s2 = sum of running s1
// values, both modulo 65521; packed result is (s2 << 16) | s1.
// `adler` is the running checksum to continue from (UPNG__ADLER32_INIT to start).
static uint32_t upng__adler32(uint32_t adler, const void* ptr, size_t size)
{
const uint8_t* data = (const uint8_t*)ptr;
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
// full chunks: deferring the modulo for up to UPNG__ADLER32_CHUNK_SIZE bytes
// cannot overflow s2 in 32 bits (see the bound above)
for (; size >= UPNG__ADLER32_CHUNK_SIZE; size -= UPNG__ADLER32_CHUNK_SIZE)
{
for (size_t k = 0; k < UPNG__ADLER32_CHUNK_SIZE; k++)
{
s1 += data[k];
s2 += s1;
}
data += UPNG__ADLER32_CHUNK_SIZE;
s1 %= UPNG__ADLER32_MOD;
s2 %= UPNG__ADLER32_MOD;
}
// remaining tail, strictly shorter than one chunk
for (size_t k = 0; k < size; k++)
{
s1 += data[k];
s2 += s1;
}
s1 %= UPNG__ADLER32_MOD;
s2 %= UPNG__ADLER32_MOD;
return (s2 << 16) | s1;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define UPNG__CRC32_INIT 0U
#if !defined(UPNG__CRC32_TABLE_COUNT)
# if defined(UPNG__ARM64_CRC32)
# define UPNG__CRC32_TABLE_COUNT 0 // no need for CRC32 tables if ACLE crc32 instruction can be used
# elif defined(UPNG__ARCH_X64)
# define UPNG__CRC32_TABLE_COUNT 16 // for x64 use 16KB table (half of L1 cache)
# else
# define UPNG__CRC32_TABLE_COUNT 8 // otherwise be safe and use only 8KB, alternatively set to 4 for 4KB table
# endif
#endif
#if UPNG__CRC32_TABLE_COUNT != 0
static uint32_t upng__crc32_table[UPNG__CRC32_TABLE_COUNT][256];
#endif
// Fills upng__crc32_table on first call (no-op when hardware CRC32 is used).
// Table [0] is the classic bit-at-a-time reflected CRC-32 table for polynomial
// 0xedb88320; tables [1..N-1] extend it for slicing-by-N multi-byte lookups.
static void upng__crc32_init(void)
{
#if UPNG__CRC32_TABLE_COUNT != 0
static int init;
if (init)
{
return;
}
const uint32_t poly = 0xedb88320;
for (uint32_t n = 0; n < 256; n++)
{
uint32_t value = n;
for (int bit = 0; bit < 8; bit++)
{
value = (value & 1) ? ((value >> 1) ^ poly) : (value >> 1);
}
upng__crc32_table[0][n] = value;
}
// each further table shifts the previous one by 8 more bits of the message
for (size_t t = 1; t < UPNG__CRC32_TABLE_COUNT; t++)
{
for (size_t n = 0; n < 256; n++)
{
uint32_t prev = upng__crc32_table[t - 1][n];
upng__crc32_table[t][n] = (prev >> 8) ^ upng__crc32_table[0][prev & 0xff];
}
}
init = 1;
#endif
}
// Continues a reflected CRC-32 over `size` bytes from running value `crc`
// (use UPNG__CRC32_INIT to start). The state is kept pre-inverted internally
// (~crc on entry, ~crc on return), zlib-style.
// On ARM64 with the CRC32 extension it uses the __crc32b instruction; otherwise
// it uses the tables built by upng__crc32_init(): a scalar loop up to 4-byte
// alignment, then slicing-by-16/8/4 (depending on UPNG__CRC32_TABLE_COUNT),
// then a scalar tail.
// NOTE(review): the word-at-a-time slicing loops index tables from the high
// byte of each 32-bit load, which matches little-endian byte order — confirm
// before using on a big-endian target.
static uint32_t upng__crc32(uint32_t crc, const void* ptr, size_t size)
{
const uint8_t* bytes = (const uint8_t*)ptr;
crc = ~crc;
// no SIMD here, it'll be used either for small chunk sizes only or without SIMD
#if defined(UPNG__ARM64_CRC32)
while (size-- != 0)
{
crc = __crc32b(crc, *bytes++);
}
#else
// byte-at-a-time until the pointer reaches 4-byte alignment
while ((((uintptr_t)bytes % 4) != 0) && (size != 0))
{
crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
size -= 1;
}
// now bytes pointer is 4-byte aligned
const uint32_t* bytes4 = (const uint32_t*)UPNG__ASSUME_ALIGNED(bytes, 4);
#if UPNG__CRC32_TABLE_COUNT == 16
// slicing-by-16: four 32-bit words per iteration
while (size >= 16)
{
uint32_t b0 = *bytes4++ ^ crc;
uint32_t b1 = *bytes4++;
uint32_t b2 = *bytes4++;
uint32_t b3 = *bytes4++;
size -= 16;
// these barriers should not affect anything, but they make MSVC(2022) to generate ~25% faster code
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
}
#elif UPNG__CRC32_TABLE_COUNT == 8
// slicing-by-8: two 32-bit words per iteration
while (size >= 8)
{
uint32_t b0 = *bytes4++ ^ crc;
uint32_t b1 = *bytes4++;
size -= 8;
size_t i0 = (b1 >> 24) & 0xff;
size_t i1 = (b1 >> 16) & 0xff;
size_t i2 = (b1 >> 8) & 0xff;
size_t i3 = b1 & 0xff;
size_t i4 = (b0 >> 24) & 0xff;
size_t i5 = (b0 >> 16) & 0xff;
size_t i6 = (b0 >> 8) & 0xff;
size_t i7 = b0 & 0xff;
// similar situation to 16 table count - this make MSVC(2022) to generate ~25% faster code
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][i0] ^ upng__crc32_table[4][i4];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[1][i1] ^ upng__crc32_table[5][i5];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[2][i2] ^ upng__crc32_table[6][i6];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[3][i3] ^ upng__crc32_table[7][i7];
}
#elif UPNG__CRC32_TABLE_COUNT == 4
// slicing-by-4: one 32-bit word per iteration
while (size >= 4)
{
uint32_t b0 = *bytes4++ ^ crc;
size -= 4;
crc = upng__crc32_table[0][(b0 >> 24) & 0xff] ^ upng__crc32_table[1][(b0 >> 16) & 0xff] ^ upng__crc32_table[2][(b0 >> 8) & 0xff] ^ upng__crc32_table[3][b0 & 0xff];
}
#endif
// scalar tail for the remaining 0..(word size - 1) bytes
bytes = (const uint8_t*)bytes4;
while (size-- != 0)
{
crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
}
#endif
return ~crc;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Running checksum state carried across IDAT payload writes.
typedef struct {
uint32_t crc; // crc32 for whole chunk + 4 byte type
uint32_t adler; // adler32 for zlib payload
} upng__idat;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#if defined(UPNG__ARCH_X64)
#define UPNG__CPUID_SSE41 (1<<1) // SSSE3+SSE4.1
#define UPNG__CPUID_CLMUL (1<<2) // SSSE3+SSE4.1+CLMUL
// Detects SSE4.1/CLMUL support via CPUID leaf 1 and caches the result.
// Bit 0 of the cached value marks "detection already done" so the CPUID
// instruction runs only once.
// FIX: the SSSE3 (bit 9), SSE4.1 (bit 19) and PCLMULQDQ (bit 1) feature flags
// are reported in ECX, which is info[2] for both the MSVC __cpuid layout
// (EAX,EBX,ECX,EDX) and the GCC/clang wrapper defined above; the previous code
// tested info[3] (EDX), where those bit positions mean APIC/CLFSH/VME.
static int upng__cpuid(void)
{
static int cpuid;
if (!cpuid)
{
int info[4];
UPNG__CPUID(1, info);
int detected = (1 << 0);
if (!!(info[2] & (1 << 9))) // SSSE3 bit
{
if (!!(info[2] & (1 << 19))) // SSE4.1 bit
{
detected |= UPNG__CPUID_SSE41;
if (!!(info[2] & (1 << 1))) // CLMUL bit
{
detected |= UPNG__CPUID_CLMUL;
}
}
}
cpuid = detected;
}
return cpuid;
}
// SSSE3/SSE4.1 row writer: for each whole 16-byte group computes
// shuffle(src - last), stores it to dst, and folds the produced bytes into the
// running adler32 (SIMD, deferred modulo) and crc32 (16-way table slicing —
// relies on the UPNG__CRC32_TABLE_COUNT == 16 tables used on x64) in `idat`.
// `shuffle64` encodes the low 8 lanes of the byte swizzle; the high 8 lanes are
// the same pattern offset by 8 (built on the first line of the body).
// `last` advances by `inc` per 16 output bytes
// (NOTE(review): callers presumably pick `last`/`inc` to realize the NONE vs
// UP filter — confirm at call sites).
// Only whole 16-byte groups are consumed; the size % 16 tail is left to the
// caller. Returns the number of bytes written.
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__row1_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
uint8_t* out = dst;
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// adler32
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
// on input
// a0 = a
// a = a0 + x[0]
// b += a0 + x[0]
// ...
// a = a0 + x[0] + x[1]
// b += 2*a0 + 2*x[0] + 1*x[1]
// ...
// a = a0 + x[0] + ... + x[N-1]
// b += N*a0 + N*x[0] + (N-1)*x[1] + ... + 2*x[N-2] + 1*x[N-1]
// processing every 16 bytes in an iteration (5552 is multiple of 16)
// va = a0 + (x[0]+...+x[15]) + (x[16]+..+x[31]) + ...
// vb = b + (16*x[0]+...+1*x[15]) + (16*x[16]+...+1*x[31]) + ... + (16*X[N-16]+...1*x[N-1])
// vs = n*a0 + (n-1)*(x[0]+...+x[15]) + (n-2)*(x[16]+...+x[31]) + ... + 1*(x[N-32]+...+x[N-17]) + 0*(x[N-16]+...+x[N-1])
// where n = N/16
// vs*16
// N*a0 + (N-16)*x[0]+...+(N-16)*x[15] + (N-32)*x[16]+...+(N-16)*x[31] + ... + 16*x[N-32]+...+16*x[N-17]
// vb+vs*16
// N*a0 + N*x[0] + (N-1)*x[1] + ... + 16*x[N-16] + 15*x[N-15] + ... + 1*x[N-1]
// for output
// a = va
// b = vb+vs*16
while (size >= 16)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// process as many 16-byte blocks as possible
size_t block_count = size / 16;
block_count = block_count < UPNG__ADLER32_BLOCKS1 ? block_count : UPNG__ADLER32_BLOCKS1;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m128i vlast = _mm_loadu_si128((const __m128i*)last);
__m128i vsrc = _mm_loadu_si128((const __m128i*)src);
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle);
_mm_storeu_si128((__m128i*)dst, vdst);
last += inc;
src += 16;
dst += 16;
size -= 16;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
// crc32 update
uint32_t b0 = _mm_extract_epi32(vdst, 0) ^ crc;
uint32_t b1 = _mm_extract_epi32(vdst, 1);
uint32_t b2 = _mm_extract_epi32(vdst, 2);
uint32_t b3 = _mm_extract_epi32(vdst, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
idat->crc = ~crc;
return dst - out;
}
#if defined(UPNG__ARCH_X64_AVX2)
// AVX2 + PCLMUL row writer: same contract as upng__row1_sse4 (each whole
// 16-byte group of output is shuffle(src - last), adler32/crc32 are folded into
// `idat`, the size % 16 tail is left to the caller, returns bytes written) but
// returns 0 immediately when size < 16, processes 64 bytes per main-loop
// iteration, and computes crc32 by carry-less-multiply folding (PCLMULQDQ)
// instead of table lookups.
static size_t UPNG__TARGET("ssse3,sse4.1,avx2,pclmul")
upng__row1_avx2(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
if (size < 16)
{
return 0;
}
uint8_t* out = dst;
const __m128i shuffle128 = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// crc32
const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0); // low 32 bits
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// https://web.archive.org/web/20230315165408/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// calculates m=(1<<n)%P, which is 32-bit value
// returns 33-bit value reflect(m<<32,64) << 1
//
// uint64_t crc32_pow2mod(size_t n)
// {
// uint32_t mod = CRC32_POLY;
// for (size_t i = 0; i < n - 32; i++)
// {
// mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
// }
// // bits are already reflected
// return (uint64_t)mod << 1;
// }
// calculates d=(1<<64)/P, which is 33-bit value (65-32=33)
// returns 33-bit value reflect(d,33)
//
// uint64_t crc32_2pow64div(void)
// {
// uint64_t div = 1;
// uint32_t mod = CRC32_POLY;
// for (size_t i = 0; i < 32; i++)
// {
// div |= (mod&1ULL) << (i+1);
// mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
// }
// // bits are already reflected
// return div;
// }
// k1 = crc32_pow2mod(4*128+32)
// k2 = crc32_pow2mod(4*128-32)
// k3 = crc32_pow2mod(128+32)
// k4 = crc32_pow2mod(128-32)
// k5 = crc32_pow2mod(64)
// P = ((uint64_t)CRC32_POLY << 1) | 1
// u = crc32_2pow64div()
// first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
__m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
__m128i x0 = _mm_cvtsi32_si128(crc);
if (size >= 64)
{
const __m256i shuffle256 = _mm256_broadcastsi128_si256(shuffle128);
// adler32
const __m256i cmul = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m256i ones = _mm256_set1_epi16(1);
const __m256i zero = _mm256_setzero_si256();
// crc32 - four independent 128-bit folding lanes
__m128i x1 = _mm_setzero_si128();
__m128i x2 = _mm_setzero_si128();
__m128i x3 = _mm_setzero_si128();
while (size >= 64)
{
__m256i vs = zero;
__m256i va = _mm256_zextsi128_si256(_mm_cvtsi32_si128(a));
__m256i vb = _mm256_zextsi128_si256(_mm_cvtsi32_si128(b));
// process as many 64-byte blocks as possible
size_t block_count = size / 64;
block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m256i vlast0 = _mm256_loadu_si256((const __m256i*)last + 0);
__m256i vlast1 = _mm256_loadu_si256((const __m256i*)last + 1);
__m256i vsrc0 = _mm256_loadu_si256((const __m256i*)src + 0);
__m256i vsrc1 = _mm256_loadu_si256((const __m256i*)src + 1);
__m256i vdst0 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc0, vlast0), shuffle256);
__m256i vdst1 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc1, vlast1), shuffle256);
_mm256_storeu_si256((__m256i*)dst + 0, vdst0);
_mm256_storeu_si256((__m256i*)dst + 1, vdst1);
last += inc * 4;
src += 64;
dst += 64;
size -= 64;
// adler32 update
vs = _mm256_add_epi32(vs, va);
va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst0, zero));
vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst0, cmul), ones));
vs = _mm256_add_epi32(vs, va);
va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst1, zero));
vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst1, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, _mm256_castsi256_si128(vdst0));
x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
x1 = _mm_xor_si128(x1, _mm256_extracti128_si256(vdst0, 1));
x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
x2 = _mm_xor_si128(x2, _mm256_castsi256_si128(vdst1));
x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
x3 = _mm_xor_si128(x3, _mm256_extracti128_si256(vdst1, 1));
crc_mul = k1k2;
}
// vb += vs * 32 (two accumulation steps per 64-byte block, 32 bytes each)
vb = _mm256_add_epi32(vb, _mm256_slli_epi32(vs, 5));
// a = sum(va)
__m128i asum = _mm_add_epi32(_mm256_castsi256_si128(va), _mm256_extracti128_si256(va, 1));
asum = _mm_hadd_epi32(asum, asum);
asum = _mm_hadd_epi32(asum, asum);
a = _mm_cvtsi128_si32(asum);
// b = sum(vb)
__m128i bsum = _mm_add_epi32(_mm256_castsi256_si128(vb), _mm256_extracti128_si256(vb, 1));
bsum = _mm_hadd_epi32(bsum, bsum);
bsum = _mm_hadd_epi32(bsum, bsum);
b = _mm_cvtsi128_si32(bsum);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
// reduce 512-bit to 128-bit
x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
crc_mul = k3k4;
}
if (size >= 16)
{
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// only 1 to 3 iterations
while (size >= 16)
{
__m128i vlast = _mm_loadu_si128((const __m128i*)last);
__m128i vsrc = _mm_loadu_si128((const __m128i*)src);
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle128);
_mm_storeu_si128((__m128i*)dst, vdst);
last += inc;
src += 16;
dst += 16;
size -= 16;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, vdst);
crc_mul = k3k4;
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
// reduce 128-bit to 96-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
// reduce 96-bit to 64-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
// reduce 64-bit to 32-bit (Barrett reduction with precomputed u and P)
__m128i x1;
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);
idat->crc = ~crc;
return dst - out;
}
#else // UPNG__ARCH_X64_AVX2
// SSE4.1 + PCLMUL row writer, compiled when the AVX2 path is disabled: same
// contract as upng__row1_avx2 — returns 0 when size < 16, otherwise writes
// shuffle(src - last) per whole 16-byte group, folds adler32/crc32 into `idat`,
// leaves the size % 16 tail to the caller, and returns bytes written.
// Processes 64 bytes per main-loop iteration using four independent 128-bit
// CRC folding lanes (constants k1..k5/poly documented in the AVX2 variant).
static size_t UPNG__TARGET("ssse3,sse4.1,pclmul")
upng__row1_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
if (size < 16)
{
return 0;
}
uint8_t* out = dst;
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// adler32
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
// crc32
const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0);
// first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
__m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
__m128i x0 = _mm_cvtsi32_si128(crc);
if (size >= 64)
{
__m128i x1 = zero;
__m128i x2 = zero;
__m128i x3 = zero;
while (size >= 64)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// process as many 64-byte blocks as possible
size_t block_count = size / 64;
block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
__m128i vlast3 = _mm_loadu_si128((const __m128i*)last + 3);
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
__m128i vsrc3 = _mm_loadu_si128((const __m128i*)src + 3);
__m128i vdst0 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc0, vlast0), shuffle);
__m128i vdst1 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc1, vlast1), shuffle);
__m128i vdst2 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc2, vlast2), shuffle);
__m128i vdst3 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc3, vlast3), shuffle);
_mm_storeu_si128((__m128i*)dst + 0, vdst0);
_mm_storeu_si128((__m128i*)dst + 1, vdst1);
_mm_storeu_si128((__m128i*)dst + 2, vdst2);
_mm_storeu_si128((__m128i*)dst + 3, vdst3);
last += inc * 4;
src += 64;
dst += 64;
size -= 64;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst3, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst3, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, vdst0);
x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
x1 = _mm_xor_si128(x1, vdst1);
x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
x2 = _mm_xor_si128(x2, vdst2);
x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
x3 = _mm_xor_si128(x3, vdst3);
crc_mul = k1k2;
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
// reduce 512-bit to 128-bit
x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
crc_mul = k3k4;
}
if (size >= 16)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// only 1 to 3 iterations
while (size >= 16)
{
__m128i vlast = _mm_loadu_si128((const __m128i*)last);
__m128i vsrc = _mm_loadu_si128((const __m128i*)src);
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle);
_mm_storeu_si128((__m128i*)dst, vdst);
last += inc;
src += 16;
dst += 16;
size -= 16;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, vdst);
crc_mul = k3k4;
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
// reduce 128-bit to 96-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
// reduce 96-bit to 64-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
// reduce 64-bit to 32-bit (Barrett reduction with precomputed u and P)
__m128i x1;
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);
idat->crc = ~crc;
return dst - out;
}
#endif // UPNG__ARCH_X64_AVX2
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__row3_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
uint8_t* out = dst;
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0]));
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1]));
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2]));
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3]));
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4]));
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5]));
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6]));
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// adler32
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
while (size >= 48)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
size_t block_count = size / 48;
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
__m128i v0 = _mm_sub_epi8(vsrc0, vlast0);
__m128i v1 = _mm_sub_epi8(vsrc1, vlast1);
__m128i v2 = _mm_sub_epi8(vsrc2, vlast2);
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01));
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12));
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22));
_mm_storeu_si128((__m128i*)dst + 0, vdst0);
_mm_storeu_si128((__m128i*)dst + 1, vdst1);
_mm_storeu_si128((__m128i*)dst + 2, vdst2);
last += inc;
src += 48;
dst += 48;
size -= 48;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
// crc32 update
uint32_t b0 = _mm_extract_epi32(vdst0, 0) ^ crc;
uint32_t b1 = _mm_extract_epi32(vdst0, 1);
uint32_t b2 = _mm_extract_epi32(vdst0, 2);
uint32_t b3 = _mm_extract_epi32(vdst0, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
b0 = _mm_extract_epi32(vdst1, 0) ^ crc;
b1 = _mm_extract_epi32(vdst1, 1);
b2 = _mm_extract_epi32(vdst1, 2);
b3 = _mm_extract_epi32(vdst1, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
b0 = _mm_extract_epi32(vdst2, 0) ^ crc;
b1 = _mm_extract_epi32(vdst2, 1);
b2 = _mm_extract_epi32(vdst2, 2);
b3 = _mm_extract_epi32(vdst2, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
idat->crc = ~crc;
return dst - out;
}
// Filter one row into PNG delta form and checksum it, 48 bytes (three SSE
// registers) at a time. For each group the byte delta (src - last) is
// computed, reshuffled through the 7-pattern table into dst, and the bytes
// actually produced are folded into both the zlib adler32 and the PNG crc32
// kept in idat. crc32 uses PCLMULQDQ carry-less folding over three
// independent 128-bit lanes, reduced to 32 bits at the end. Only whole
// 48-byte groups are consumed; returns the number of bytes written to dst
// (the caller handles any remainder). `last` advances by `inc` bytes per
// group -- presumably the previous-row stride; confirm at call sites.
static size_t UPNG__TARGET("ssse3,sse4.1,pclmul")
upng__row3_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t (*shuffle)[16])
{
    if (size < 48)
    {
        return 0;
    }
    uint8_t* out = dst;
    // seven shuffle patterns: s00/s01 build output reg 0, s10/s11/s12 reg 1,
    // s21/s22 reg 2 (results are OR'd, unused lanes are 0xff -> zero bytes)
    const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0]));
    const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1]));
    const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2]));
    const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3]));
    const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4]));
    const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5]));
    const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6]));
    // adler32 state: a = low 16 bits, b = high 16 bits of idat->adler
    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    // crc32 state is kept inverted in idat (standard pre/post complement)
    uint32_t crc = ~idat->crc;
    // adler32
    // byte weights 16..1 for the per-register weighted sum
    const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);
    const __m128i zero = _mm_setzero_si128();
    // crc32
    // k1 = crc32_pow2mod(3*128+32)
    // k2 = crc32_pow2mod(3*128-32)
    const __m128i k1k2 = _mm_setr_epi32(0x3db1ecdc, 0, 0x74359406, 1);
    const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
    const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
    const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
    const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0);
    // multiplier 1 so the very first loop iteration does not fold;
    // switched to k1k2 (fold by 384 bits) after the first pass
    __m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
    __m128i x0 = _mm_cvtsi32_si128(crc);
    __m128i x1 = zero;
    __m128i x2 = zero;
    while (size >= 48)
    {
        __m128i vs = zero;
        __m128i va = _mm_cvtsi32_si128(a);
        __m128i vb = _mm_cvtsi32_si128(b);
        // process as many 3x16-byte blocks as possible
        // UPNG__ADLER32_BLOCKS3 bounds the batch so the 32-bit lane sums
        // cannot overflow before the modulo below (presumably; confirm
        // against the constant's definition)
        size_t block_count = size / 48;
        block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3;
        for (size_t i = 0; i < block_count; i++)
        {
            // pixel filtering
            __m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
            __m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
            __m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
            __m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
            __m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
            __m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
            __m128i v0 = _mm_sub_epi8(vsrc0, vlast0);
            __m128i v1 = _mm_sub_epi8(vsrc1, vlast1);
            __m128i v2 = _mm_sub_epi8(vsrc2, vlast2);
            __m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01));
            __m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12));
            __m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22));
            _mm_storeu_si128((__m128i*)dst + 0, vdst0);
            _mm_storeu_si128((__m128i*)dst + 1, vdst1);
            _mm_storeu_si128((__m128i*)dst + 2, vdst2);
            last += inc;
            src += 48;
            dst += 48;
            size -= 48;
            // adler32 update
            // vs accumulates "a before each register" (scaled by 16 later),
            // va the plain byte sums, vb the 16..1 weighted sums
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
            // crc32 update
            // fold each 128-bit lane forward by 384 bits, then mix in data
            x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
            x0 = _mm_xor_si128(x0, vdst0);
            x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
            x1 = _mm_xor_si128(x1, vdst1);
            x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
            x2 = _mm_xor_si128(x2, vdst2);
            crc_mul = k1k2;
        }
        // vb += vs * 16
        vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
        // a = sum(va)
        va = _mm_hadd_epi32(va, va);
        va = _mm_hadd_epi32(va, va);
        a = _mm_cvtsi128_si32(va);
        // b = sum(vb)
        vb = _mm_hadd_epi32(vb, vb);
        vb = _mm_hadd_epi32(vb, vb);
        b = _mm_cvtsi128_si32(vb);
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }
    idat->adler = a | (b << 16);
    // reduce 384-bit to 128-bit
    x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
    x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
    // reduce 128-bit to 96-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
    // reduce 96-bit to 64-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
    // reduce 64-bit to 32-bit
    // Barrett reduction with mu (poly high half) and the crc32 polynomial
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
    crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);
    idat->crc = ~crc;
    return dst - out;
}
// Undo the row filter for formats that need a single-register shuffle:
// reorder each 16 filtered bytes with the per-format pattern, then add
// back the reference row bytes (dst = shuffle(src) + last). Consumes only
// whole 16-byte groups and returns the number of bytes produced; the
// caller deals with the remainder. `last` advances by `inc` per group.
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__unrow1_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    // the pattern for the high 8 lanes is the low pattern offset by 8
    const __m128i pattern = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
    size_t blocks = size / 16;
    for (size_t i = 0; i < blocks; i++)
    {
        __m128i delta = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(src + i * 16)), pattern);
        __m128i prev = _mm_loadu_si128((const __m128i*)(last + i * inc));
        _mm_storeu_si128((__m128i*)(dst + i * 16), _mm_add_epi8(delta, prev));
    }
    return blocks * 16;
}
// Undo the row filter for 3-register (48-byte) groups: rebuild the three
// output registers via the seven OR-combined shuffle patterns, then add
// back the reference row. Consumes only whole 48-byte groups and returns
// the number of bytes produced; the caller handles the tail.
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__unrow3_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
    // p00/p01 assemble output reg 0, p10/p11/p12 reg 1, p21/p22 reg 2
    const __m128i p00 = _mm_load_si128((const __m128i*)(shuffle[0]));
    const __m128i p01 = _mm_load_si128((const __m128i*)(shuffle[1]));
    const __m128i p10 = _mm_load_si128((const __m128i*)(shuffle[2]));
    const __m128i p11 = _mm_load_si128((const __m128i*)(shuffle[3]));
    const __m128i p12 = _mm_load_si128((const __m128i*)(shuffle[4]));
    const __m128i p21 = _mm_load_si128((const __m128i*)(shuffle[5]));
    const __m128i p22 = _mm_load_si128((const __m128i*)(shuffle[6]));
    size_t blocks = size / 48;
    for (size_t i = 0; i < blocks; i++)
    {
        const __m128i* in = (const __m128i*)(src + i * 48);
        const __m128i* ref = (const __m128i*)(last + i * inc);
        __m128i* outp = (__m128i*)(dst + i * 48);
        __m128i in0 = _mm_loadu_si128(in + 0);
        __m128i in1 = _mm_loadu_si128(in + 1);
        __m128i in2 = _mm_loadu_si128(in + 2);
        __m128i r0 = _mm_or_si128(_mm_shuffle_epi8(in0, p00), _mm_shuffle_epi8(in1, p01));
        __m128i r1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(in0, p10), _mm_shuffle_epi8(in1, p11)), _mm_shuffle_epi8(in2, p12));
        __m128i r2 = _mm_or_si128(_mm_shuffle_epi8(in1, p21), _mm_shuffle_epi8(in2, p22));
        _mm_storeu_si128(outp + 0, _mm_add_epi8(r0, _mm_loadu_si128(ref + 0)));
        _mm_storeu_si128(outp + 1, _mm_add_epi8(r1, _mm_loadu_si128(ref + 1)));
        _mm_storeu_si128(outp + 2, _mm_add_epi8(r2, _mm_loadu_si128(ref + 2)));
    }
    return blocks * 48;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#if defined(UPNG__ARCH_ARM64)
// ARM64: filter one row (dst = shuffle(src - last), one NEON register per
// 16 bytes) while updating the adler32 and crc32 in idat. Three crc32
// strategies are selected at compile time:
//   UPNG__ARM64_CRYPTO - PMULL carry-less folding (4 lanes in the 64-byte loop)
//   UPNG__ARM64_CRC32  - hardware __crc32d, 8 bytes per instruction
//   otherwise          - table-driven crc32, 8 bytes per step
// A 64-byte (4-register) main loop is followed by a 16-byte loop for the
// remaining 1-3 registers. Only whole 16-byte groups are consumed; returns
// the number of bytes written to dst. `last` advances `inc` per 16 bytes.
static size_t upng__row1_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }
    uint8_t* out = dst;
    // high 8 table indices are the low pattern offset by 8
    const uint64_t shuffle64_high = shuffle64 + 0x0808080808080808;
    const uint8x16_t shuffle = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64_high)));
    // adler32 state: a = low 16 bits, b = high 16 bits of idat->adler
    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    // crc32 state is kept inverted in idat (standard pre/post complement)
    uint32_t crc = ~idat->crc;
    // adler32 byte weights 16..1, packed little-endian as two 8-byte halves
    const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708));
    const uint32x4_t zero = vdupq_n_u32(0);
#if defined(UPNG__ARM64_CRYPTO)
    // crc32 fold-by-4 constants (512-bit distance) and Barrett reduction
    // terms; these match the published PCLMUL/PMULL crc32 constants
    const poly64x2_t k1k2 = { 0x154442bd4, 0x1c6e41596 };
    const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e };
    const poly64_t k5 = { 0x163cd6124 };
    const poly64_t poly_u = { 0x0f7011641 };
    const poly64_t poly_p = { 0x1db710641 };
    const uint64x2_t mask32 = { ~0U, 0 };
    // multiplier 1 so the first iteration does not fold; k1k2 afterwards
    poly64x2_t crc_mul = { 1, 0 };
    poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));
// carry-less "add" is xor of the 128-bit values
#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b)))
// x = clmul_fold(x, k) ^ value -- folds accumulator x by the distance
// encoded in k, then mixes in 16 new bytes
#define UPNG__CLMUL_P128(x,k,value) do { \
    poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \
    poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \
    x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \
} while (0)
#endif
    if (size >= 64)
    {
#if defined(UPNG__ARM64_CRYPTO)
        poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0)));
        poly128_t x2 = x1;
        poly128_t x3 = x1;
#endif
        while (size >= 64)
        {
            uint32x4_t va = vsetq_lane_u32(a, zero, 0);
            uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
            uint32x4_t vs = zero;
            // process as many 64-byte blocks as possible
            // batch is bounded so 32-bit adler lane sums cannot overflow
            // before the modulo below (presumably; confirm the constant)
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                uint8x16x4_t vlast = vld1q_u8_x4(last);
                uint8x16x4_t vsrc = vld1q_u8_x4(src);
                uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]);
                uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]);
                uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]);
                uint8x16_t v3 = vsubq_u8(vsrc.val[3], vlast.val[3]);
                uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
                uint8x16_t vdst1 = vqtbl1q_u8(v1, shuffle);
                uint8x16_t vdst2 = vqtbl1q_u8(v2, shuffle);
                uint8x16_t vdst3 = vqtbl1q_u8(v3, shuffle);
                uint8x16x4_t vdst = { vdst0, vdst1, vdst2, vdst3 };
                vst1q_u8_x4(dst, vdst);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // these could use vdotq_u32, but it runs ~2% slower
                // adler32: vs = running "a" sums, va = byte sums,
                // vb = 16..1 weighted sums (vb += vs*16 after the batch)
                uint16x8_t t0, t1, t2, t3;
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst0));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst1));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst2));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst3));
                t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
                t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul));
                t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul));
                t3 = vmull_u8(vget_low_u8(vdst3), vget_low_u8(cmul));
                t0 = vmlal_high_u8(t0, vdst0, cmul);
                t1 = vmlal_high_u8(t1, vdst1, cmul);
                t2 = vmlal_high_u8(t2, vdst2, cmul);
                t3 = vmlal_high_u8(t3, vdst3, cmul);
                vb = vpadalq_u16(vb, t0);
                vb = vpadalq_u16(vb, t1);
                vb = vpadalq_u16(vb, t2);
                vb = vpadalq_u16(vb, t3);
#if defined(UPNG__ARM64_CRYPTO)
                // fold each of the 4 lanes by 512 bits, mix in new data
                UPNG__CLMUL_P128(x0, crc_mul, vdst0);
                UPNG__CLMUL_P128(x1, crc_mul, vdst1);
                UPNG__CLMUL_P128(x2, crc_mul, vdst2);
                UPNG__CLMUL_P128(x3, crc_mul, vdst3);
                crc_mul = k1k2;
#elif defined(UPNG__ARM64_CRC32)
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 1));
#else
                // generic path: table-driven crc32, two 32-bit halves per
                // 8-byte step (8 table lookups)
                uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
                uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
            }
            // vb += vs * 16, then reduce lanes and take the adler modulo
            vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
            a = vaddvq_u32(va);
            b = vaddvq_u32(vb);
            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }
#if defined(UPNG__ARM64_CRYPTO)
        // reduce 512-bit to 128-bit
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x3));
        // subsequent 16-byte folds use the 128-bit fold distance
        crc_mul = k3k4;
#endif
    }
    if (size >= 16)
    {
        uint32x4_t va = vsetq_lane_u32(a, zero, 0);
        uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
        uint32x4_t vs = zero;
        // only 1 to 3 iterations
        while (size >= 16)
        {
            uint8x16_t v0 = vsubq_u8(vld1q_u8(src), vld1q_u8(last));
            uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
            vst1q_u8(dst, vdst0);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            uint16x8_t t0;
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst0));
            t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
            t0 = vmlal_high_u8(t0, vdst0, cmul);
            vb = vpadalq_u16(vb, t0);
#if defined(UPNG__ARM64_CRYPTO)
            UPNG__CLMUL_P128(x0, crc_mul, vdst0);
            crc_mul = k3k4;
#elif defined(UPNG__ARM64_CRC32)
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
#else
            uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
            uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
        }
        vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
        a = vaddvq_u32(va);
        b = vaddvq_u32(vb);
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }
#if defined(UPNG__ARM64_CRYPTO)
    // reduce 128-bit to 96-bit
    poly128_t p0;
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1)));
    // reduce 96-bit to 64-bit
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5));
    // reduce 64-bit to 32-bit
    // Barrett reduction with mu (poly_u) and the crc32 polynomial (poly_p)
    poly128_t x1;
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u);
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p);
    crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1);
#undef UPNG__CLADD_P128
#undef UPNG__CLMUL_P128
#endif
    idat->adler = a | (b << 16);
    idat->crc = ~crc;
    return dst - out;
}
// ARM64: filter one row in 48-byte groups (three NEON registers) for
// formats whose bytes must be reassembled across register boundaries
// (see the 7-pattern shuffle tables). Computes dst = shuffle(src - last)
// and folds the produced bytes into the adler32 and crc32 in idat.
// crc32 strategy is chosen at compile time: PMULL folding over three
// 128-bit lanes (UPNG__ARM64_CRYPTO), hardware __crc32d
// (UPNG__ARM64_CRC32), or table lookups otherwise. Only whole 48-byte
// groups are consumed; returns bytes written to dst (caller handles the
// tail). `last` advances by `inc` bytes per group.
static size_t upng__row3_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
    if (size < 48)
    {
        return 0;
    }
    uint8_t* out = dst;
    // s00/s01 assemble output reg 0, s10/s11/s12 reg 1, s21/s22 reg 2
    const uint8x16_t s00 = vld1q_u8(shuffle[0]);
    const uint8x16_t s01 = vld1q_u8(shuffle[1]);
    const uint8x16_t s10 = vld1q_u8(shuffle[2]);
    const uint8x16_t s11 = vld1q_u8(shuffle[3]);
    const uint8x16_t s12 = vld1q_u8(shuffle[4]);
    const uint8x16_t s21 = vld1q_u8(shuffle[5]);
    const uint8x16_t s22 = vld1q_u8(shuffle[6]);
    // adler32 state: a = low 16 bits, b = high 16 bits of idat->adler
    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    // crc32 state is kept inverted in idat (standard pre/post complement)
    uint32_t crc = ~idat->crc;
    // adler32 byte weights 16..1, packed little-endian as two 8-byte halves
    const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708));
    const uint32x4_t zero = vdupq_n_u32(0);
#if defined(UPNG__ARM64_CRYPTO)
    // fold-by-3 constants: k1 = crc32_pow2mod(3*128+32),
    // k2 = crc32_pow2mod(3*128-32); k3k4/k5/poly_* for final reduction
    const poly64x2_t k1k2 = { 0x03db1ecdc, 0x174359406 };
    const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e };
    const poly64_t k5 = { 0x163cd6124 };
    const poly64_t poly_u = { 0x0f7011641 };
    const poly64_t poly_p = { 0x1db710641 };
    const uint64x2_t mask32 = { ~0U, 0 };
    // multiplier 1 so the first iteration does not fold; k1k2 afterwards
    poly64x2_t crc_mul = { 1, 0 };
    poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));
    poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0)));
    poly128_t x2 = x1;
// carry-less "add" is xor of the 128-bit values
#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b)))
// x = clmul_fold(x, k) ^ value -- folds accumulator x by the distance
// encoded in k, then mixes in 16 new bytes
#define UPNG__CLMUL_P128(x,k,value) do { \
    poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \
    poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \
    x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \
} while (0)
#endif
    while (size >= 48)
    {
        uint32x4_t va = vsetq_lane_u32(a, zero, 0);
        uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
        uint32x4_t vs = zero;
        // process as many 3x16-byte blocks as possible
        // batch is bounded so 32-bit adler lane sums cannot overflow before
        // the modulo below (presumably; confirm the constant's definition)
        size_t block_count = size / 48;
        block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3;
        for (size_t i = 0; i < block_count; i++)
        {
            // pixel filtering: delta against `last`, then reassemble bytes
            uint8x16x3_t vlast = vld1q_u8_x3(last);
            uint8x16x3_t vsrc = vld1q_u8_x3(src);
            uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]);
            uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]);
            uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]);
            uint8x16_t vdst0 = vqtbx1q_u8(vqtbl1q_u8(v0, s00), v1, s01);
            uint8x16_t vdst1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(v0, s10), v1, s11), v2, s12);
            uint8x16_t vdst2 = vqtbx1q_u8(vqtbl1q_u8(v1, s21), v2, s22);
            uint8x16x3_t vdst = { vdst0, vdst1, vdst2 };
            vst1q_u8_x3(dst, vdst);
            last += inc;
            src += 48;
            dst += 48;
            size -= 48;
            // adler32: vs = running "a" sums, va = byte sums,
            // vb = 16..1 weighted sums (vb += vs*16 after the batch)
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst0));
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst1));
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst2));
            uint16x8_t t0, t1, t2;
            t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
            t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul));
            t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul));
            t0 = vmlal_high_u8(t0, vdst0, cmul);
            t1 = vmlal_high_u8(t1, vdst1, cmul);
            t2 = vmlal_high_u8(t2, vdst2, cmul);
            vb = vpadalq_u16(vb, t0);
            vb = vpadalq_u16(vb, t1);
            vb = vpadalq_u16(vb, t2);
#if defined(UPNG__ARM64_CRYPTO)
            // fold each of the 3 lanes by 384 bits, mix in new data
            UPNG__CLMUL_P128(x0, crc_mul, vdst0);
            UPNG__CLMUL_P128(x1, crc_mul, vdst1);
            UPNG__CLMUL_P128(x2, crc_mul, vdst2);
            crc_mul = k1k2;
#elif defined(UPNG__ARM64_CRC32)
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1));
#else
            // generic path: table-driven crc32, two 32-bit halves per
            // 8-byte step (8 table lookups)
            uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
            uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
        }
        // vb += vs * 16, then reduce lanes and take the adler modulo
        vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
        a = vaddvq_u32(va);
        b = vaddvq_u32(vb);
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }
#if defined(UPNG__ARM64_CRYPTO)
    // reduce 384-bit to 128-bit
    UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1));
    UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2));
    // reduce 128-bit to 96-bit
    poly128_t p0;
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1)));
    // reduce 96-bit to 64-bit
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5));
    // reduce 64-bit to 32-bit
    // Barrett reduction with mu (poly_u) and the crc32 polynomial (poly_p)
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u);
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p);
    crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1);
#undef UPNG__CLMUL_P128
#undef UPNG__CLADD_P128
#endif
    idat->adler = a | (b << 16);
    idat->crc = ~crc;
    return dst - out;
}
// Undo the row filter for single-register formats: reorder each 16
// filtered bytes with the per-format tbl pattern and add back the
// reference row (dst = shuffle(src) + last). Consumes whole 16-byte
// groups only and returns the number of bytes produced.
static size_t upng__unrow1_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    // high 8 table indices are the low pattern offset by 8
    const uint8x16_t pattern = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64 + 0x0808080808080808)));
    size_t blocks = size / 16;
    for (size_t i = 0; i < blocks; i++)
    {
        uint8x16_t delta = vqtbl1q_u8(vld1q_u8(src + i * 16), pattern);
        vst1q_u8(dst + i * 16, vaddq_u8(delta, vld1q_u8(last + i * inc)));
    }
    return blocks * 16;
}
// Undo the row filter for 48-byte (3-register) groups: rebuild the byte
// order with the seven tbl/tbx patterns, then add back the reference row.
// Consumes whole 48-byte groups only and returns the bytes produced.
static size_t upng__unrow3_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
    // p00/p01 assemble output reg 0, p10/p11/p12 reg 1, p21/p22 reg 2
    const uint8x16_t p00 = vld1q_u8(shuffle[0]);
    const uint8x16_t p01 = vld1q_u8(shuffle[1]);
    const uint8x16_t p10 = vld1q_u8(shuffle[2]);
    const uint8x16_t p11 = vld1q_u8(shuffle[3]);
    const uint8x16_t p12 = vld1q_u8(shuffle[4]);
    const uint8x16_t p21 = vld1q_u8(shuffle[5]);
    const uint8x16_t p22 = vld1q_u8(shuffle[6]);
    size_t blocks = size / 48;
    for (size_t i = 0; i < blocks; i++)
    {
        uint8x16x3_t in = vld1q_u8_x3(src + i * 48);
        uint8x16_t r0 = vqtbx1q_u8(vqtbl1q_u8(in.val[0], p00), in.val[1], p01);
        uint8x16_t r1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(in.val[0], p10), in.val[1], p11), in.val[2], p12);
        uint8x16_t r2 = vqtbx1q_u8(vqtbl1q_u8(in.val[1], p21), in.val[2], p22);
        uint8x16x3_t ref = vld1q_u8_x3(last + i * inc);
        uint8x16x3_t result = { vaddq_u8(r0, ref.val[0]), vaddq_u8(r1, ref.val[1]), vaddq_u8(r2, ref.val[2]) };
        vst1q_u8_x3(dst + i * 48, result);
    }
    return blocks * 48;
}
#endif
// `_` / `__` mark "don't care" lanes (index 0xff): SSE pshufb writes zero
// for an index with the high bit set and the results are OR'd together,
// while NEON tbx leaves the destination byte untouched -- either way the
// byte comes from one of the other patterns in the group.
#define _ 0xff
#define __ _
// 7 patterns per format: rows [0]/[1] combine into output register 0,
// rows [2]/[3]/[4] into register 1, rows [5]/[6] into register 2
// (matches the s00..s22 loads in the row/unrow functions).
// identity shuffle, nothing to rearrange
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB8[7][16]) =
{
    { 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15},
};
// BGR8 -> RGB8: swaps the B and R bytes of each 3-byte pixel; pixels
// straddle the 16-byte register boundaries, hence the cross-register
// patterns (trailing ASCII diagrams show each output register's layout).
// 0123456789012345
// src0 = [BGRBGRBGRBGRBGRB]
// src1 = [GRBGRBGRBGRBGRBG]
// src2 = [RBGRBGRBGRBGRBGR]
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR8[7][16]) =
{
    { 2,1,0, 5,4,3, 8,7,6, 11,10,9, 14,13,12, _}, // RGB RGB RGB RGB RGB _
    { _,_,_, _,_,_, _,_,_, __,__,_, __,__,__, 1}, // ___ ___ ___ ___ ___ R
    { _,15, _,_,_, _,_,_, __,_,_, __,__,__, _,__}, // _B ___ ___ ___ ___ __
    { 0,__, 4,3,2, 7,6,5, 10,9,8, 13,12,11, _,15}, // G_ RGB RGB RGB RGB _G
    { _,__, _,_,_, _,_,_, __,_,_, __,__,__, 0,__}, // __ ___ ___ ___ ___ R_
    {14, _,_,_, _,_,_, _,_,_, __,__,__, __,__,__}, // B ___ ___ ___ ___ ___
    {__, 3,2,1, 6,5,4, 9,8,7, 12,11,10, 15,14,13}, // _ RGB RGB RGB RGB RGB
};
// RGB16 -> big-endian PNG samples: PNG stores 16-bit channels big-endian,
// so each native little-endian 16-bit value has its two bytes swapped.
// only swap bytes in each 16-bit value
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB16[7][16]) =
{
    {1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14},
};
// 0123456789012345
// src0 = [BBGGRRBBGGRRBBGG]
// src1 = [RRBBGGRRBBGGRRBB]
// src2 = [GGRRBBGGRRBBGGRR]
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR16[7][16]) =
{
{5,4,3,2,1,0, 11,10,9,8,7,6, _,_,15,14}, // RRGGBB RRGGBB __GG
{_,_,_,_,_,_, __,__,_,_,_,_, 1,0,__,__}, // ______ ______ RR__
{13,12, _,_,_,_,_,_, __,__,__,__,_,_, _,_}, // BB ______ ______ __
{__,__, 7,6,5,4,3,2, 13,12,11,10,9,8, _,_}, // __ RRGGBB RRGGBB __
{__,__, _,_,_,_,_,_, __,__,__,__,_,_, 3,2}, // __ ______ ______ RR
{_,_, 15,14, _,_,_,_,_,_, __,__,__,__,__,__}, // __BB ______ ______
{1,0, __,__, 9,8,7,6,5,4, 15,14,13,12,11,10}, // GG__ RRGGBB RRGGBB
};
#undef _
#undef __
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16
// Selects the 8-byte lane permutation that converts one pixel group to PNG
// byte order (RGBA channel order, big-endian samples) and dispatches to the
// best available SIMD kernel. Returns the number of input bytes consumed by
// the kernel (0 when no kernel is available for this target).
static size_t upng__row1(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    uint64_t shuffle64 = 0;
    if (format == UPNG_FORMAT_G8 || format == UPNG_FORMAT_GA8 || format == UPNG_FORMAT_RGBA8)
    {
        shuffle64 = 0x0706050403020100; // nothing to shuffle, identity
    }
    else if (format == UPNG_FORMAT_BGRA8)
    {
        shuffle64 = 0x0704050603000102; // BGRA to RGBA
    }
    else if (format == UPNG_FORMAT_G16 || format == UPNG_FORMAT_GA16 || format == UPNG_FORMAT_RGBA16)
    {
        shuffle64 = 0x0607040502030001; // swap bytes in each 16-bit sample
    }
    else if (format == UPNG_FORMAT_BGRA16)
    {
        shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA
    }
#if defined(UPNG__ARCH_X64_AVX2)
    // AVX2 build: kernel choice depends on carry-less multiply (crc) support
    return (upng__cpuid() & UPNG__CPUID_CLMUL)
        ? upng__row1_avx2(idat, dst, src, last, size, inc, shuffle64)
        : upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64);
#elif defined(UPNG__ARCH_X64)
    int cpuid = upng__cpuid();
    if (cpuid & UPNG__CPUID_CLMUL)
    {
        return upng__row1_clmul(idat, dst, src, last, size, inc, shuffle64);
    }
    if (cpuid & UPNG__CPUID_SSE41)
    {
        return upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__row1_arm64(idat, dst, src, last, size, inc, shuffle64);
#else
    (void)shuffle64;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// handles RGB8, BGR8, RGB16, BGR16
// Picks the 7-entry shuffle table for 3-byte/6-byte pixels and dispatches to
// the best available SIMD kernel. Returns the number of input bytes consumed
// (0 when no kernel is available for this target).
static size_t upng__row3(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    const uint8_t (*shuffle)[16] =
        format == UPNG_FORMAT_RGB8  ? upng__shuffle_RGB8  :
        format == UPNG_FORMAT_BGR8  ? upng__shuffle_BGR8  :
        format == UPNG_FORMAT_RGB16 ? upng__shuffle_RGB16 :
        format == UPNG_FORMAT_BGR16 ? upng__shuffle_BGR16 :
        NULL;
#if defined(UPNG__ARCH_X64)
    int cpuid = upng__cpuid();
    if (cpuid & UPNG__CPUID_CLMUL)
    {
        return upng__row3_clmul(idat, dst, src, last, size, inc, shuffle);
    }
    if (cpuid & UPNG__CPUID_SSE41)
    {
        return upng__row3_sse4(idat, dst, src, last, size, inc, shuffle);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__row3_arm64(idat, dst, src, last, size, inc, shuffle);
#else
    (void)shuffle;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// Encodes one image row into PNG scanline bytes (excluding the leading filter
// byte): converts the pixel format to PNG channel order with big-endian
// samples, applies the UP filter (output = current byte - same byte of the
// previous row), and folds the produced bytes into the running adler32/crc32.
// - src: current row in the user's format; pitch: bytes between user rows
// - dst: destination for filtered PNG bytes; size: row length in bytes
// The bulk is done by the SIMD helpers in whole 16/48-byte groups; the switch
// below handles the remaining tail pixels one at a time.
static void upng__row(upng__idat* idat, uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter)
{
// NONE filter is same as UP when previous row is all 0 values
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 };
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : src - pitch;
size_t inc;
size_t used;
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16)
{
// 3-channel formats: SIMD consumes 48 input bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 48;
used = upng__row3(idat, dst, src, last, size, inc, format);
}
else
{
// 1/2/4-channel formats: SIMD consumes 16 input bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 16;
used = upng__row1(idat, dst, src, last, size, inc, format);
}
// advance "last" only when UP filtering (inc != 0); for NONE it stays on the
// zero buffer so the subtraction below is a no-op
last += inc == 0 ? 0 : used;
src += used;
dst += used;
size -= used;
// remember the scalar tail so its bytes can be checksummed at the end
// NOTE(review): the SIMD kernels are expected to have already folded the
// "used" bytes into idat->adler/idat->crc — confirm in the arch kernels
uint8_t* tail = dst;
size_t tail_size = size;
// size < inc
// per-format scalar tail; "inc" is rescaled from bytes-per-SIMD-group to
// bytes-per-pixel so "last" steps one source pixel at a time
switch (format)
{
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes
inc /= 16; // 0 or 1
while (size != 0)
{
dst[0] = src[0] - last[0];
last += inc;
src += 1;
dst += 1;
size -= 1;
}
break;
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
dst[0] = src[0] - last[0];
dst[1] = src[1] - last[1];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
dst[0] = src[0] - last[0];
dst[1] = src[1] - last[1];
dst[2] = src[2] - last[2];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
// output in RGB order; "last" indexes the source (BGR) layout
dst[0] = src[2] - last[2];
dst[1] = src[1] - last[1];
dst[2] = src[0] - last[0];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[0] - last[0];
dst[1] = src[1] - last[1];
dst[2] = src[2] - last[2];
dst[3] = src[3] - last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[2] - last[2];
dst[1] = src[1] - last[1];
dst[2] = src[0] - last[0];
dst[3] = src[3] - last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
// byte-swap each little-endian 16-bit sample to PNG big-endian
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[5] - last[5];
dst[5] = src[4] - last[4];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
// reverse channel order and byte-swap in one pass
dst[0] = src[5] - last[5];
dst[1] = src[4] - last[4];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[1] - last[1];
dst[5] = src[0] - last[0];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[5] - last[5];
dst[5] = src[4] - last[4];
dst[6] = src[7] - last[7];
dst[7] = src[6] - last[6];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[5] - last[5];
dst[1] = src[4] - last[4];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[1] - last[1];
dst[5] = src[0] - last[0];
dst[6] = src[7] - last[7];
dst[7] = src[6] - last[6];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
}
// fold the scalar-tail bytes into the running zlib adler32 and chunk crc32
idat->adler = upng__adler32(idat->adler, tail, tail_size);
idat->crc = upng__crc32(idat->crc, tail, tail_size);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16
// Selects the 8-byte lane permutation that converts PNG byte order back to
// the user's pixel format and dispatches to the best available SIMD kernel.
// Returns the number of bytes consumed (0 when no kernel is available).
static size_t upng__unrow1(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    uint64_t shuffle64 = 0;
    if (format == UPNG_FORMAT_G8 || format == UPNG_FORMAT_GA8 || format == UPNG_FORMAT_RGBA8)
    {
        shuffle64 = 0x0706050403020100; // nothing to shuffle, identity
    }
    else if (format == UPNG_FORMAT_BGRA8)
    {
        shuffle64 = 0x0704050603000102; // BGRA to RGBA
    }
    else if (format == UPNG_FORMAT_G16 || format == UPNG_FORMAT_GA16 || format == UPNG_FORMAT_RGBA16)
    {
        shuffle64 = 0x0607040502030001; // swap bytes in each 16-bit sample
    }
    else if (format == UPNG_FORMAT_BGRA16)
    {
        shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA
    }
#if defined(UPNG__ARCH_X64_AVX2)
    // AVX2 build implies SSE4.1 is available, no runtime check needed
    return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64);
#elif defined(UPNG__ARCH_X64)
    if (upng__cpuid() & UPNG__CPUID_SSE41)
    {
        return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__unrow1_arm64(dst, src, last, size, inc, shuffle64);
#else
    (void)shuffle64;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// handles RGB8, BGR8, RGB16, BGR16
// Picks the 7-entry shuffle table for 3-byte/6-byte pixels and dispatches to
// the best available SIMD kernel. Returns the number of bytes consumed
// (0 when no kernel is available for this target).
static size_t upng__unrow3(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    const uint8_t (*shuffle)[16] =
        format == UPNG_FORMAT_RGB8  ? upng__shuffle_RGB8  :
        format == UPNG_FORMAT_BGR8  ? upng__shuffle_BGR8  :
        format == UPNG_FORMAT_RGB16 ? upng__shuffle_RGB16 :
        format == UPNG_FORMAT_BGR16 ? upng__shuffle_BGR16 :
        NULL;
#if defined(UPNG__ARCH_X64)
    if (upng__cpuid() & UPNG__CPUID_SSE41)
    {
        return upng__unrow3_sse4(dst, src, last, size, inc, shuffle);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__unrow3_arm64(dst, src, last, size, inc, shuffle);
#else
    (void)shuffle;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// Decodes one PNG scanline (without its leading filter byte) into the user's
// pixel format: undoes the UP filter (output = PNG byte + same byte of the
// previous decoded row) and converts channel order / sample endianness.
// - src: filtered PNG bytes for this row; size: row length in bytes
// - dst: destination row in the user's format; pitch: bytes between dst rows
// "last" points into the already-decoded destination image, so it is indexed
// with destination-layout offsets (unlike upng__row, which indexes the source
// layout). The bulk is done by the SIMD helpers; the switch handles the tail.
static void upng__unrow(uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter)
{
// NONE filter is same as UP when previous row is all 0 values
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 };
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : dst - pitch;
size_t inc;
size_t used;
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16)
{
// 3-channel formats: SIMD consumes 48 bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 48;
used = upng__unrow3(dst, src, last, size, inc, format);
}
else
{
// 1/2/4-channel formats: SIMD consumes 16 bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 16;
used = upng__unrow1(dst, src, last, size, inc, format);
}
// advance "last" only when UP filtering; for NONE it stays on the zero buffer
last += inc == 0 ? 0 : used;
src += used;
dst += used;
size -= used;
// size < inc
// per-format scalar tail; "inc" is rescaled from bytes-per-SIMD-group to
// bytes-per-pixel so "last" steps one destination pixel at a time
switch (format)
{
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes
inc /= 16; // 0 or 1
while (size != 0)
{
dst[0] = src[0] + last[0];
last += inc;
src += 1;
dst += 1;
size -= 1;
}
break;
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
dst[0] = src[0] + last[0];
dst[1] = src[1] + last[1];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
dst[0] = src[0] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[2] + last[2];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
// src is RGB (PNG order); "last" indexes the destination (BGR) layout
dst[0] = src[2] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[0] + last[2];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[0] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[2] + last[2];
dst[3] = src[3] + last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[2] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[0] + last[2];
dst[3] = src[3] + last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
// byte-swap each big-endian PNG sample to little-endian output
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[5] + last[4];
dst[5] = src[4] + last[5];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
// reverse channel order and byte-swap in one pass
dst[0] = src[5] + last[0];
dst[1] = src[4] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[1] + last[4];
dst[5] = src[0] + last[5];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[5] + last[4];
dst[5] = src[4] + last[5];
dst[6] = src[7] + last[6];
dst[7] = src[6] + last[7];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[5] + last[0];
dst[1] = src[4] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[1] + last[4];
dst[5] = src[0] + last[5];
dst[6] = src[7] + last[6];
dst[7] = src[6] + last[7];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// 8-byte PNG file signature, https://www.w3.org/TR/png/#5PNG-file-signature
static const char upng__sig[] = "\x89PNG\r\n\x1a\n";
// IHDR payload size: width(4) + height(4) + bit depth(1) + color type(1) +
// compression(1) + filter(1) + interlace(1) = 13 bytes
static const size_t upng__ihdr_size = 13;
// max chunk size
// (PNG chunk length field must not exceed 2^31 - 1)
static const size_t upng__max_chunk_size = (1U << 31) - 1;
// bytes per pixel for each format, indexed by upng_format enum value
static const uint32_t upng__bpp[] =
{
1, // G8
2, // GA8
3, // RGB8
3, // BGR8
4, // RGBA8
4, // BGRA8
2, // G16
4, // GA16
6, // RGB16
6, // BGR16
8, // RGBA16
8, // BGRA16
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter)
{
if (width == 0 || height == 0)
{
// bad width or height
return 0;
}
const uint32_t bpp = upng__bpp[format];
if (pitch == 0)
{
pitch = width * bpp;
}
if ((size_t)width * height < (size_t)width || (size_t)width * height >= (size_t)1 << 48)
{
// width and height too big
return 0;
}
if ((size_t)pitch * bpp < (size_t)pitch)
{
// pitch too large, overflows size_t
return 0;
}
static const char iend_chunk[] = "\0\0\0\0IEND\xae\x42\x60\x82";
// max zlib block size
const uint32_t max_block_size = 65535;
// how many pixels fit into one zlib block (conservative estimate, because of filter byte)
const uint32_t pixels_per_block = (max_block_size - 1) / bpp;
// how many full zlib blocks needed per row
size_t full_block_count = width / pixels_per_block;
// how many pixels are left
size_t tail_block_pixels = width % pixels_per_block;
// how many bytes in full zlib blocks
size_t full_block_size = full_block_count * (1 + 4 + pixels_per_block * bpp);
// how many bytes in last zlib block
size_t last_block_size = tail_block_pixels ? (1 + 4 + tail_block_pixels * bpp) : 0;
// total size per row including filter byte
size_t size_per_row = 1 + full_block_size + last_block_size;
// how many rows fit into IDAT chunk
size_t rows_per_idat = upng__max_chunk_size / size_per_row;
if (rows_per_idat == 0)
{
// code assumes it can fit at least one full row (including zlib block headers) into IDAT
return 0;
}
if (!dst)
{
size_t size = 0;
// png signature
size += sizeof(upng__sig) - 1;
// IHDR chunk
size += 4 + 4 + upng__ihdr_size + 4;
// first IDAT chunk contains 2 zlib header bytes
size += 4 + 4 + 2 + 4;
// how many full IDAT chunks
size_t full_idat_count = height / rows_per_idat;
// how many rows in last IDAT
size_t tail_idat_rows = height % rows_per_idat;
size += (4 + 4 + rows_per_idat * size_per_row + 4) * full_idat_count;
size += tail_idat_rows ? (4 + 4 + tail_idat_rows * size_per_row + 4) : 0;
// last IDAT chunk with empty zlib block & adler32
size += 4 + 4 + (1 + 4) + (4) + 4;
// IEND chunk
size += sizeof(iend_chunk) - 1;
return size;
}
upng__crc32_init();
uint8_t* out = (uint8_t*)dst;
// file signature, https://www.w3.org/TR/png/#5PNG-file-signature
for (size_t i = 0; i < sizeof(upng__sig) - 1; i++)
{
*out++ = (uint8_t)upng__sig[i];
}