Skip to content

Instantly share code, notes, and snippets.

@mmozeiko
Last active February 25, 2024 08:08
Show Gist options
  • Star 14 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mmozeiko/e66f6d23e101b1b9c37cb3d9d10727f5 to your computer and use it in GitHub Desktop.
Save mmozeiko/e66f6d23e101b1b9c37cb3d9d10727f5 to your computer and use it in GitHub Desktop.
uncompressed png writer & reader
#pragma once
// uncompressed png writer & reader
// supports only 8-bit and 16-bit formats
// Performance comparison for 8192x8192 BGRA8 image (256MB)
// Compiled with "clang -O2", AVX2 requires extra "-mavx2" or "/arch:AVX2" argument
//
// For libpng (compressed) uses default libpng/zlib compression settings
// For libpng (uncompressed) case following two functions are used:
// png_set_compression_level() with Z_NO_COMPRESSION
// png_set_filter() with PNG_FILTER_NONE
//
// Ryzen 5950x
// upng (AVX2) = 22.9 msec (11157.3 MB/s), read = 20.5 msec (12499.4 MB/s)
// upng = 27.7 msec ( 9254.6 MB/s), read = 20.8 msec (12296.8 MB/s)
// libpng (uncompressed) = 169.9 msec ( 1506.9 MB/s), read = 167.5 msec ( 1528.6 MB/s)
// libpng (compressed) = 2148.1 msec ( 119.2 MB/s), read = 503.5 msec ( 508.4 MB/s)
//
// Raspberry Pi4 (-march=armv8+crc)
// upng = 182.9 msec (1399.7 MB/s), read = 110.8 msec (2310.8 MB/s)
// libpng (uncompressed) = 1192.7 msec ( 214.6 MB/s), read = 1211.8 msec ( 211.3 MB/s)
// libpng (compressed) = 9396.8 msec ( 27.2 MB/s), read = 1874.6 msec ( 136.6 MB/s)
//
// Apple M1 (-march=armv8+crc+crypto)
// upng = 22.2 msec (11523.7 MB/s), read = 8.9 msec (28622.5 MB/s)
// libpng (uncompressed) = 93.3 msec ( 2743.3 MB/s), read = 66.6 msec ( 3841.8 MB/s)
// libpng (compressed) = 2038.6 msec ( 125.6 MB/s), read = 90.4 msec ( 2832.5 MB/s)
#include <stddef.h>
#include <stdint.h>
// Pixel formats supported by the writer/reader. The numeric suffix is bits per
// channel (only 8 and 16, per the header comment above); the letters give the
// in-memory channel order: G = gray, A = alpha, RGB/BGR = color component order.
typedef enum {
UPNG_FORMAT_G8,     // gray
UPNG_FORMAT_GA8,    // gray + alpha
UPNG_FORMAT_RGB8,   // red, green, blue
UPNG_FORMAT_BGR8,   // blue, green, red
UPNG_FORMAT_RGBA8,  // red, green, blue, alpha
UPNG_FORMAT_BGRA8,  // blue, green, red, alpha
UPNG_FORMAT_G16,    // 16-bit variants of the above
UPNG_FORMAT_GA16,
UPNG_FORMAT_RGB16,
UPNG_FORMAT_BGR16,
UPNG_FORMAT_RGBA16,
UPNG_FORMAT_BGRA16,
} upng_format;
// Scanline filter the writer applies. Values match the PNG filter-type byte
// (0 = None, 2 = Up) as defined by the PNG specification.
typedef enum {
UPNG_FILTER_NONE = 0,
UPNG_FILTER_UP = 2,
} upng_filter;
// if `dst` is NULL then function will quickly return size needed for `dst` (`src` won't be used)
// if `pitch` is 0, then pixels in `src` are tightly packed without any padding bytes between rows
// returns 0 for unsupported parameter values
static size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter);
// output BGR/BGRA format instead of RGB/RGBA
#define UPNG_READ_SWAP_TO_BGR 1
// if `dst` is NULL then function will quickly return `width` / `height` / `format` values from png header
// if `pitch` is 0, then pixels in `dst` will be tightly packed without any padding bytes between rows
// returns total size of image - `pitch` multiplied by `height`
// returns 0 if png file cannot be successfully parsed or is unsupported
// function does NOT verify CRC32 or ADLER32 checksums
static size_t upng_read(void* dst, const void* src, size_t size, uint32_t* width, uint32_t* height, upng_format* format, size_t pitch, uint32_t flags);
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// optional defines:
// UPNG_DISABLE_AVX2 - do not use AVX2 codepath, even if AVX2 is allowed by compiler
// UPNG_USE_ARM64_PMUL - prefer to use ARM64 PMUL instruction instead of CRC32 on non-Apple targets
// this may be slower than using CRC32 instruction, depends on CPU
#if defined(_M_AMD64) || defined(__x86_64__)
# define UPNG__ARCH_X64
#elif defined(_M_ARM64) || defined(__aarch64__)
# define UPNG__ARCH_ARM64
#endif
#if defined(UPNG__ARCH_X64)
# if defined(__clang__) || defined(__GNUC__)
# include <cpuid.h>
# define UPNG__CPUID(num, regs) __cpuid(num, regs[0], regs[1], regs[2], regs[3])
# define UPNG__CPUID2(num, sub, regs) __cpuid_count(num, sub, regs[0], regs[1], regs[2], regs[3])
# define UPNG__TARGET(str) __attribute__((target(str)))
# else
# include <intrin.h>
# define UPNG__CPUID(num, regs) __cpuid(regs, num)
# define UPNG__CPUID2(num, sub, regs) __cpuidex(regs, num, sub)
# define UPNG__TARGET(str)
# endif
# if defined(__AVX2__) && !defined(UPNG_DISABLE_AVX2)
# define UPNG__ARCH_X64_AVX2
# include <immintrin.h>
# if !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER > 1930 && _MSC_VER < 1936)
// broken MSVC versions that do not generate VEX encoded VPCLMULQDQ instruction
// see https://developercommunity.visualstudio.com/t/_mm_clmulepi64_si128-intrinsic-no-longer/10277103
# pragma message("WARNING: this MSVC compiler version produces very bad performance with AVX2 codegen!")
# undef UPNG__ARCH_X64_AVX2
# elif !defined(__clang__) && defined(_MSC_VER) && (_MSC_VER == 1938)
// broken MSVC version that generate AVX512 instructions in AVX2 code
// see https://developercommunity.visualstudio.com/t/Invalid-AVX512-instructions-generated-wh/10521872
# pragma message("WARNING: this MSVC compiler version produces invalid instructions with AVX2 codegen!")
# undef UPNG__ARCH_X64_AVX2
# endif
# endif
# include <wmmintrin.h> // CLMUL // _mm_clmulepi64_si128
# include <smmintrin.h> // SSE4.1 // _mm_extract_epi32
# include <tmmintrin.h> // SSSE3 // _mm_maddubs_epi16, _mm_hadd_epi32, _mm_shuffle_epi8
# include <emmintrin.h> // SSE2
#elif defined(UPNG__ARCH_ARM64)
# include <arm_neon.h>
# if __ARM_FEATURE_CRC32 // use -march=armv8-a+crc when possible
# define UPNG__ARM64_CRC32 // __crc32d, __crc32b
# include <arm_acle.h>
# endif
# if __ARM_FEATURE_CRYPTO // use -march=armv8-a+crypto when possible
# if defined(__APPLE__) || defined(UPNG_USE_ARM64_PMUL) || !defined(UPNG__ARM64_CRC32)
# define UPNG__ARM64_CRYPTO // vmull_p64, vmull_high_p64
# endif
# endif
#endif
#if defined(_MSC_VER) && !defined(__clang__)
# include <intrin.h>
# define UPNG__ALIGN(n, var) __declspec(align(n)) var
# define UPNG__MSVC_BARRIER() _ReadWriteBarrier()
# define UPNG__ASSUME_ALIGNED(ptr, align) (ptr)
#else
# define UPNG__ALIGN(n, var) var __attribute__((aligned(n)))
# define UPNG__MSVC_BARRIER() // not needed for non-MSVC compilers
# define UPNG__ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned(ptr, align)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define UPNG__ADLER32_INIT 1U
#define UPNG__ADLER32_MOD 65521
// max amount of bytes to not overflow "b" as uint32_t
// max "b" value = 255*n*(n+1)/2 + (n+1)*(65521-1)
#define UPNG__ADLER32_CHUNK_SIZE 5552
// max amount of 16-byte blocks to use for SIMD
#define UPNG__ADLER32_BLOCKS1 (UPNG__ADLER32_CHUNK_SIZE / 16)
#define UPNG__ADLER32_BLOCKS3 (UPNG__ADLER32_CHUNK_SIZE / 48)
#define UPNG__ADLER32_BLOCKS4 (UPNG__ADLER32_CHUNK_SIZE / 64)
// Scalar Adler-32 (RFC 1950): s1 = 1 + sum of bytes, s2 = sum of running s1
// values, both modulo 65521; packed result is (s2 << 16) | s1.
// `adler` is the running checksum to continue from (UPNG__ADLER32_INIT to start).
static uint32_t upng__adler32(uint32_t adler, const void* ptr, size_t size)
{
const uint8_t* data = (const uint8_t*)ptr;
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
// full chunks: deferring the modulo for up to UPNG__ADLER32_CHUNK_SIZE bytes
// cannot overflow s2 in 32 bits (see the bound above)
for (; size >= UPNG__ADLER32_CHUNK_SIZE; size -= UPNG__ADLER32_CHUNK_SIZE)
{
for (size_t k = 0; k < UPNG__ADLER32_CHUNK_SIZE; k++)
{
s1 += data[k];
s2 += s1;
}
data += UPNG__ADLER32_CHUNK_SIZE;
s1 %= UPNG__ADLER32_MOD;
s2 %= UPNG__ADLER32_MOD;
}
// remaining tail, strictly shorter than one chunk
for (size_t k = 0; k < size; k++)
{
s1 += data[k];
s2 += s1;
}
s1 %= UPNG__ADLER32_MOD;
s2 %= UPNG__ADLER32_MOD;
return (s2 << 16) | s1;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define UPNG__CRC32_INIT 0U
#if !defined(UPNG__CRC32_TABLE_COUNT)
# if defined(UPNG__ARM64_CRC32)
# define UPNG__CRC32_TABLE_COUNT 0 // no need for CRC32 tables if ACLE crc32 instruction can be used
# elif defined(UPNG__ARCH_X64)
# define UPNG__CRC32_TABLE_COUNT 16 // for x64 use 16KB table (half of L1 cache)
# else
# define UPNG__CRC32_TABLE_COUNT 8 // otherwise be safe and use only 8KB, alternatively set to 4 for 4KB table
# endif
#endif
#if UPNG__CRC32_TABLE_COUNT != 0
static uint32_t upng__crc32_table[UPNG__CRC32_TABLE_COUNT][256];
#endif
// Fills upng__crc32_table on first call (no-op when hardware CRC32 is used).
// Table [0] is the classic bit-at-a-time reflected CRC-32 table for polynomial
// 0xedb88320; tables [1..N-1] extend it for slicing-by-N multi-byte lookups.
static void upng__crc32_init(void)
{
#if UPNG__CRC32_TABLE_COUNT != 0
static int init;
if (init)
{
return;
}
const uint32_t poly = 0xedb88320;
for (uint32_t n = 0; n < 256; n++)
{
uint32_t value = n;
for (int bit = 0; bit < 8; bit++)
{
value = (value & 1) ? ((value >> 1) ^ poly) : (value >> 1);
}
upng__crc32_table[0][n] = value;
}
// each further table shifts the previous one by 8 more bits of the message
for (size_t t = 1; t < UPNG__CRC32_TABLE_COUNT; t++)
{
for (size_t n = 0; n < 256; n++)
{
uint32_t prev = upng__crc32_table[t - 1][n];
upng__crc32_table[t][n] = (prev >> 8) ^ upng__crc32_table[0][prev & 0xff];
}
}
init = 1;
#endif
}
// Continues a reflected CRC-32 over `size` bytes from running value `crc`
// (use UPNG__CRC32_INIT to start). The state is kept pre-inverted internally
// (~crc on entry, ~crc on return), zlib-style.
// On ARM64 with the CRC32 extension it uses the __crc32b instruction; otherwise
// it uses the tables built by upng__crc32_init(): a scalar loop up to 4-byte
// alignment, then slicing-by-16/8/4 (depending on UPNG__CRC32_TABLE_COUNT),
// then a scalar tail.
// NOTE(review): the word-at-a-time slicing loops index tables from the high
// byte of each 32-bit load, which matches little-endian byte order — confirm
// before using on a big-endian target.
static uint32_t upng__crc32(uint32_t crc, const void* ptr, size_t size)
{
const uint8_t* bytes = (const uint8_t*)ptr;
crc = ~crc;
// no SIMD here, it'll be used either for small chunk sizes only or without SIMD
#if defined(UPNG__ARM64_CRC32)
while (size-- != 0)
{
crc = __crc32b(crc, *bytes++);
}
#else
// byte-at-a-time until the pointer reaches 4-byte alignment
while ((((uintptr_t)bytes % 4) != 0) && (size != 0))
{
crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
size -= 1;
}
// now bytes pointer is 4-byte aligned
const uint32_t* bytes4 = (const uint32_t*)UPNG__ASSUME_ALIGNED(bytes, 4);
#if UPNG__CRC32_TABLE_COUNT == 16
// slicing-by-16: four 32-bit words per iteration
while (size >= 16)
{
uint32_t b0 = *bytes4++ ^ crc;
uint32_t b1 = *bytes4++;
uint32_t b2 = *bytes4++;
uint32_t b3 = *bytes4++;
size -= 16;
// these barriers should not affect anything, but they make MSVC(2022) to generate ~25% faster code
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
}
#elif UPNG__CRC32_TABLE_COUNT == 8
// slicing-by-8: two 32-bit words per iteration
while (size >= 8)
{
uint32_t b0 = *bytes4++ ^ crc;
uint32_t b1 = *bytes4++;
size -= 8;
size_t i0 = (b1 >> 24) & 0xff;
size_t i1 = (b1 >> 16) & 0xff;
size_t i2 = (b1 >> 8) & 0xff;
size_t i3 = b1 & 0xff;
size_t i4 = (b0 >> 24) & 0xff;
size_t i5 = (b0 >> 16) & 0xff;
size_t i6 = (b0 >> 8) & 0xff;
size_t i7 = b0 & 0xff;
// similar situation to 16 table count - this make MSVC(2022) to generate ~25% faster code
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][i0] ^ upng__crc32_table[4][i4];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[1][i1] ^ upng__crc32_table[5][i5];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[2][i2] ^ upng__crc32_table[6][i6];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[3][i3] ^ upng__crc32_table[7][i7];
}
#elif UPNG__CRC32_TABLE_COUNT == 4
// slicing-by-4: one 32-bit word per iteration
while (size >= 4)
{
uint32_t b0 = *bytes4++ ^ crc;
size -= 4;
crc = upng__crc32_table[0][(b0 >> 24) & 0xff] ^ upng__crc32_table[1][(b0 >> 16) & 0xff] ^ upng__crc32_table[2][(b0 >> 8) & 0xff] ^ upng__crc32_table[3][b0 & 0xff];
}
#endif
// scalar tail for the remaining 0..(word size - 1) bytes
bytes = (const uint8_t*)bytes4;
while (size-- != 0)
{
crc = (crc >> 8) ^ upng__crc32_table[0][(crc & 0xff) ^ *bytes++];
}
#endif
return ~crc;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Running checksum state carried across IDAT payload writes.
typedef struct {
uint32_t crc; // crc32 for whole chunk + 4 byte type
uint32_t adler; // adler32 for zlib payload
} upng__idat;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#if defined(UPNG__ARCH_X64)
#define UPNG__CPUID_SSE41 (1<<1) // SSSE3+SSE4.1
#define UPNG__CPUID_CLMUL (1<<2) // SSSE3+SSE4.1+CLMUL
// Detects SSE4.1/CLMUL support via CPUID leaf 1 and caches the result.
// Bit 0 of the cached value marks "detection already done" so the CPUID
// instruction runs only once.
// FIX: the SSSE3 (bit 9), SSE4.1 (bit 19) and PCLMULQDQ (bit 1) feature flags
// are reported in ECX, which is info[2] for both the MSVC __cpuid layout
// (EAX,EBX,ECX,EDX) and the GCC/clang wrapper defined above; the previous code
// tested info[3] (EDX), where those bit positions mean APIC/CLFSH/VME.
static int upng__cpuid(void)
{
static int cpuid;
if (!cpuid)
{
int info[4];
UPNG__CPUID(1, info);
int detected = (1 << 0);
if (!!(info[2] & (1 << 9))) // SSSE3 bit
{
if (!!(info[2] & (1 << 19))) // SSE4.1 bit
{
detected |= UPNG__CPUID_SSE41;
if (!!(info[2] & (1 << 1))) // CLMUL bit
{
detected |= UPNG__CPUID_CLMUL;
}
}
}
cpuid = detected;
}
return cpuid;
}
// SSSE3/SSE4.1 row writer: for each whole 16-byte group computes
// shuffle(src - last), stores it to dst, and folds the produced bytes into the
// running adler32 (SIMD, deferred modulo) and crc32 (16-way table slicing —
// relies on the UPNG__CRC32_TABLE_COUNT == 16 tables used on x64) in `idat`.
// `shuffle64` encodes the low 8 lanes of the byte swizzle; the high 8 lanes are
// the same pattern offset by 8 (built on the first line of the body).
// `last` advances by `inc` per 16 output bytes
// (NOTE(review): callers presumably pick `last`/`inc` to realize the NONE vs
// UP filter — confirm at call sites).
// Only whole 16-byte groups are consumed; the size % 16 tail is left to the
// caller. Returns the number of bytes written.
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__row1_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
uint8_t* out = dst;
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// adler32
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
// on input
// a0 = a
// a = a0 + x[0]
// b += a0 + x[0]
// ...
// a = a0 + x[0] + x[1]
// b += 2*a0 + 2*x[0] + 1*x[1]
// ...
// a = a0 + x[0] + ... + x[N-1]
// b += N*a0 + N*x[0] + (N-1)*x[1] + ... + 2*x[N-2] + 1*x[N-1]
// processing every 16 bytes in an iteration (5552 is multiple of 16)
// va = a0 + (x[0]+...+x[15]) + (x[16]+..+x[31]) + ...
// vb = b + (16*x[0]+...+1*x[15]) + (16*x[16]+...+1*x[31]) + ... + (16*X[N-16]+...1*x[N-1])
// vs = n*a0 + (n-1)*(x[0]+...+x[15]) + (n-2)*(x[16]+...+x[31]) + ... + 1*(x[N-32]+...+x[N-17]) + 0*(x[N-16]+...+x[N-1])
// where n = N/16
// vs*16
// N*a0 + (N-16)*x[0]+...+(N-16)*x[15] + (N-32)*x[16]+...+(N-16)*x[31] + ... + 16*x[N-32]+...+16*x[N-17]
// vb+vs*16
// N*a0 + N*x[0] + (N-1)*x[1] + ... + 16*x[N-16] + 15*x[N-15] + ... + 1*x[N-1]
// for output
// a = va
// b = vb+vs*16
while (size >= 16)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// process as many 16-byte blocks as possible
size_t block_count = size / 16;
block_count = block_count < UPNG__ADLER32_BLOCKS1 ? block_count : UPNG__ADLER32_BLOCKS1;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m128i vlast = _mm_loadu_si128((const __m128i*)last);
__m128i vsrc = _mm_loadu_si128((const __m128i*)src);
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle);
_mm_storeu_si128((__m128i*)dst, vdst);
last += inc;
src += 16;
dst += 16;
size -= 16;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
// crc32 update
uint32_t b0 = _mm_extract_epi32(vdst, 0) ^ crc;
uint32_t b1 = _mm_extract_epi32(vdst, 1);
uint32_t b2 = _mm_extract_epi32(vdst, 2);
uint32_t b3 = _mm_extract_epi32(vdst, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
idat->crc = ~crc;
return dst - out;
}
#if defined(UPNG__ARCH_X64_AVX2)
// AVX2 + PCLMUL row writer: same contract as upng__row1_sse4 (each whole
// 16-byte group of output is shuffle(src - last), adler32/crc32 are folded into
// `idat`, the size % 16 tail is left to the caller, returns bytes written) but
// returns 0 immediately when size < 16, processes 64 bytes per main-loop
// iteration, and computes crc32 by carry-less-multiply folding (PCLMULQDQ)
// instead of table lookups.
static size_t UPNG__TARGET("ssse3,sse4.1,avx2,pclmul")
upng__row1_avx2(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
if (size < 16)
{
return 0;
}
uint8_t* out = dst;
const __m128i shuffle128 = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// crc32
const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0); // low 32 bits
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// https://web.archive.org/web/20230315165408/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// calculates m=(1<<n)%P, which is 32-bit value
// returns 33-bit value reflect(m<<32,64) << 1
//
// uint64_t crc32_pow2mod(size_t n)
// {
// uint32_t mod = CRC32_POLY;
// for (size_t i = 0; i < n - 32; i++)
// {
// mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
// }
// // bits are already reflected
// return (uint64_t)mod << 1;
// }
// calculates d=(1<<64)/P, which is 33-bit value (65-32=33)
// returns 33-bit value reflect(d,33)
//
// uint64_t crc32_2pow64div(void)
// {
// uint64_t div = 1;
// uint32_t mod = CRC32_POLY;
// for (size_t i = 0; i < 32; i++)
// {
// div |= (mod&1ULL) << (i+1);
// mod = (mod >> 1) ^ (mod & 1 ? CRC32_POLY : 0);
// }
// // bits are already reflected
// return div;
// }
// k1 = crc32_pow2mod(4*128+32)
// k2 = crc32_pow2mod(4*128-32)
// k3 = crc32_pow2mod(128+32)
// k4 = crc32_pow2mod(128-32)
// k5 = crc32_pow2mod(64)
// P = ((uint64_t)CRC32_POLY << 1) | 1
// u = crc32_2pow64div()
// first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
__m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
__m128i x0 = _mm_cvtsi32_si128(crc);
if (size >= 64)
{
const __m256i shuffle256 = _mm256_broadcastsi128_si256(shuffle128);
// adler32
const __m256i cmul = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m256i ones = _mm256_set1_epi16(1);
const __m256i zero = _mm256_setzero_si256();
// crc32 - four independent 128-bit folding lanes
__m128i x1 = _mm_setzero_si128();
__m128i x2 = _mm_setzero_si128();
__m128i x3 = _mm_setzero_si128();
while (size >= 64)
{
__m256i vs = zero;
__m256i va = _mm256_zextsi128_si256(_mm_cvtsi32_si128(a));
__m256i vb = _mm256_zextsi128_si256(_mm_cvtsi32_si128(b));
// process as many 64-byte blocks as possible
size_t block_count = size / 64;
block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m256i vlast0 = _mm256_loadu_si256((const __m256i*)last + 0);
__m256i vlast1 = _mm256_loadu_si256((const __m256i*)last + 1);
__m256i vsrc0 = _mm256_loadu_si256((const __m256i*)src + 0);
__m256i vsrc1 = _mm256_loadu_si256((const __m256i*)src + 1);
__m256i vdst0 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc0, vlast0), shuffle256);
__m256i vdst1 = _mm256_shuffle_epi8(_mm256_sub_epi8(vsrc1, vlast1), shuffle256);
_mm256_storeu_si256((__m256i*)dst + 0, vdst0);
_mm256_storeu_si256((__m256i*)dst + 1, vdst1);
last += inc * 4;
src += 64;
dst += 64;
size -= 64;
// adler32 update
vs = _mm256_add_epi32(vs, va);
va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst0, zero));
vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst0, cmul), ones));
vs = _mm256_add_epi32(vs, va);
va = _mm256_add_epi32(va, _mm256_sad_epu8(vdst1, zero));
vb = _mm256_add_epi32(vb, _mm256_madd_epi16(_mm256_maddubs_epi16(vdst1, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, _mm256_castsi256_si128(vdst0));
x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
x1 = _mm_xor_si128(x1, _mm256_extracti128_si256(vdst0, 1));
x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
x2 = _mm_xor_si128(x2, _mm256_castsi256_si128(vdst1));
x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
x3 = _mm_xor_si128(x3, _mm256_extracti128_si256(vdst1, 1));
crc_mul = k1k2;
}
// vb += vs * 32 (two accumulation steps per 64-byte block, 32 bytes each)
vb = _mm256_add_epi32(vb, _mm256_slli_epi32(vs, 5));
// a = sum(va)
__m128i asum = _mm_add_epi32(_mm256_castsi256_si128(va), _mm256_extracti128_si256(va, 1));
asum = _mm_hadd_epi32(asum, asum);
asum = _mm_hadd_epi32(asum, asum);
a = _mm_cvtsi128_si32(asum);
// b = sum(vb)
__m128i bsum = _mm_add_epi32(_mm256_castsi256_si128(vb), _mm256_extracti128_si256(vb, 1));
bsum = _mm_hadd_epi32(bsum, bsum);
bsum = _mm_hadd_epi32(bsum, bsum);
b = _mm_cvtsi128_si32(bsum);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
// reduce 512-bit to 128-bit
x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
crc_mul = k3k4;
}
if (size >= 16)
{
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// only 1 to 3 iterations
while (size >= 16)
{
__m128i vlast = _mm_loadu_si128((const __m128i*)last);
__m128i vsrc = _mm_loadu_si128((const __m128i*)src);
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle128);
_mm_storeu_si128((__m128i*)dst, vdst);
last += inc;
src += 16;
dst += 16;
size -= 16;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, vdst);
crc_mul = k3k4;
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
// reduce 128-bit to 96-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
// reduce 96-bit to 64-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
// reduce 64-bit to 32-bit (Barrett reduction with precomputed u and P)
__m128i x1;
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);
idat->crc = ~crc;
return dst - out;
}
#else // UPNG__ARCH_X64_AVX2
// SSE4.1 + PCLMUL row writer, compiled when the AVX2 path is disabled: same
// contract as upng__row1_avx2 — returns 0 when size < 16, otherwise writes
// shuffle(src - last) per whole 16-byte group, folds adler32/crc32 into `idat`,
// leaves the size % 16 tail to the caller, and returns bytes written.
// Processes 64 bytes per main-loop iteration using four independent 128-bit
// CRC folding lanes (constants k1..k5/poly documented in the AVX2 variant).
static size_t UPNG__TARGET("ssse3,sse4.1,pclmul")
upng__row1_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
if (size < 16)
{
return 0;
}
uint8_t* out = dst;
const __m128i shuffle = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// adler32
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
// crc32
const __m128i k1k2 = _mm_setr_epi32(0x54442bd4, 1, 0xc6e41596, 1);
const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0);
// first iteration does not need to multiply, just leave x0 unchanged: x0*1 => x0
__m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
__m128i x0 = _mm_cvtsi32_si128(crc);
if (size >= 64)
{
__m128i x1 = zero;
__m128i x2 = zero;
__m128i x3 = zero;
while (size >= 64)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// process as many 64-byte blocks as possible
size_t block_count = size / 64;
block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
__m128i vlast3 = _mm_loadu_si128((const __m128i*)last + 3);
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
__m128i vsrc3 = _mm_loadu_si128((const __m128i*)src + 3);
__m128i vdst0 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc0, vlast0), shuffle);
__m128i vdst1 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc1, vlast1), shuffle);
__m128i vdst2 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc2, vlast2), shuffle);
__m128i vdst3 = _mm_shuffle_epi8(_mm_sub_epi8(vsrc3, vlast3), shuffle);
_mm_storeu_si128((__m128i*)dst + 0, vdst0);
_mm_storeu_si128((__m128i*)dst + 1, vdst1);
_mm_storeu_si128((__m128i*)dst + 2, vdst2);
_mm_storeu_si128((__m128i*)dst + 3, vdst3);
last += inc * 4;
src += 64;
dst += 64;
size -= 64;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst3, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst3, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, vdst0);
x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
x1 = _mm_xor_si128(x1, vdst1);
x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
x2 = _mm_xor_si128(x2, vdst2);
x3 = _mm_xor_si128(_mm_clmulepi64_si128(x3, crc_mul, 0x00), _mm_clmulepi64_si128(x3, crc_mul, 0x11));
x3 = _mm_xor_si128(x3, vdst3);
crc_mul = k1k2;
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
// reduce 512-bit to 128-bit
x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
x0 = _mm_xor_si128(x3, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
crc_mul = k3k4;
}
if (size >= 16)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
// only 1 to 3 iterations
while (size >= 16)
{
__m128i vlast = _mm_loadu_si128((const __m128i*)last);
__m128i vsrc = _mm_loadu_si128((const __m128i*)src);
__m128i vdst = _mm_shuffle_epi8(_mm_sub_epi8(vsrc, vlast), shuffle);
_mm_storeu_si128((__m128i*)dst, vdst);
last += inc;
src += 16;
dst += 16;
size -= 16;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst, cmul), ones));
// crc32 update
x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
x0 = _mm_xor_si128(x0, vdst);
crc_mul = k3k4;
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
// reduce 128-bit to 96-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
// reduce 96-bit to 64-bit
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
// reduce 64-bit to 32-bit (Barrett reduction with precomputed u and P)
__m128i x1;
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);
idat->crc = ~crc;
return dst - out;
}
#endif // UPNG__ARCH_X64_AVX2
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__row3_sse4(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
uint8_t* out = dst;
const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0]));
const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1]));
const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2]));
const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3]));
const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4]));
const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5]));
const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6]));
uint32_t a = idat->adler & 0xffff;
uint32_t b = idat->adler >> 16;
uint32_t crc = ~idat->crc;
// adler32
const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i ones = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
while (size >= 48)
{
__m128i vs = zero;
__m128i va = _mm_cvtsi32_si128(a);
__m128i vb = _mm_cvtsi32_si128(b);
size_t block_count = size / 48;
block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3;
for (size_t i = 0; i < block_count; i++)
{
// pixel filtering
__m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
__m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
__m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
__m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
__m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
__m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
__m128i v0 = _mm_sub_epi8(vsrc0, vlast0);
__m128i v1 = _mm_sub_epi8(vsrc1, vlast1);
__m128i v2 = _mm_sub_epi8(vsrc2, vlast2);
__m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01));
__m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12));
__m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22));
_mm_storeu_si128((__m128i*)dst + 0, vdst0);
_mm_storeu_si128((__m128i*)dst + 1, vdst1);
_mm_storeu_si128((__m128i*)dst + 2, vdst2);
last += inc;
src += 48;
dst += 48;
size -= 48;
// adler32 update
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
vs = _mm_add_epi32(vs, va);
va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
// crc32 update
uint32_t b0 = _mm_extract_epi32(vdst0, 0) ^ crc;
uint32_t b1 = _mm_extract_epi32(vdst0, 1);
uint32_t b2 = _mm_extract_epi32(vdst0, 2);
uint32_t b3 = _mm_extract_epi32(vdst0, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
b0 = _mm_extract_epi32(vdst1, 0) ^ crc;
b1 = _mm_extract_epi32(vdst1, 1);
b2 = _mm_extract_epi32(vdst1, 2);
b3 = _mm_extract_epi32(vdst1, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
b0 = _mm_extract_epi32(vdst2, 0) ^ crc;
b1 = _mm_extract_epi32(vdst2, 1);
b2 = _mm_extract_epi32(vdst2, 2);
b3 = _mm_extract_epi32(vdst2, 3);
UPNG__MSVC_BARRIER();
crc = upng__crc32_table[0][(b3 >> 24) & 0xff] ^ upng__crc32_table[1][(b3 >> 16) & 0xff] ^ upng__crc32_table[2][(b3 >> 8) & 0xff] ^ upng__crc32_table[3][b3 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[4][(b2 >> 24) & 0xff] ^ upng__crc32_table[5][(b2 >> 16) & 0xff] ^ upng__crc32_table[6][(b2 >> 8) & 0xff] ^ upng__crc32_table[7][b2 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[8][(b1 >> 24) & 0xff] ^ upng__crc32_table[9][(b1 >> 16) & 0xff] ^ upng__crc32_table[10][(b1 >> 8) & 0xff] ^ upng__crc32_table[11][b1 & 0xff];
UPNG__MSVC_BARRIER();
crc ^= upng__crc32_table[12][(b0 >> 24) & 0xff] ^ upng__crc32_table[13][(b0 >> 16) & 0xff] ^ upng__crc32_table[14][(b0 >> 8) & 0xff] ^ upng__crc32_table[15][b0 & 0xff];
}
// vb += vs * 16
vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
// a = sum(va)
va = _mm_hadd_epi32(va, va);
va = _mm_hadd_epi32(va, va);
a = _mm_cvtsi128_si32(va);
// b = sum(vb)
vb = _mm_hadd_epi32(vb, vb);
vb = _mm_hadd_epi32(vb, vb);
b = _mm_cvtsi128_si32(vb);
a %= UPNG__ADLER32_MOD;
b %= UPNG__ADLER32_MOD;
}
idat->adler = a | (b << 16);
idat->crc = ~crc;
return dst - out;
}
// Filter one row into PNG delta form and checksum it, 48 bytes (three SSE
// registers) at a time. For each group the byte delta (src - last) is
// computed, reshuffled through the 7-pattern table into dst, and the bytes
// actually produced are folded into both the zlib adler32 and the PNG crc32
// kept in idat. crc32 uses PCLMULQDQ carry-less folding over three
// independent 128-bit lanes, reduced to 32 bits at the end. Only whole
// 48-byte groups are consumed; returns the number of bytes written to dst
// (the caller handles any remainder). `last` advances by `inc` bytes per
// group -- presumably the previous-row stride; confirm at call sites.
static size_t UPNG__TARGET("ssse3,sse4.1,pclmul")
upng__row3_clmul(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t (*shuffle)[16])
{
    if (size < 48)
    {
        return 0;
    }
    uint8_t* out = dst;
    // seven shuffle patterns: s00/s01 build output reg 0, s10/s11/s12 reg 1,
    // s21/s22 reg 2 (results are OR'd, unused lanes are 0xff -> zero bytes)
    const __m128i s00 = _mm_load_si128((const __m128i*)(shuffle[0]));
    const __m128i s01 = _mm_load_si128((const __m128i*)(shuffle[1]));
    const __m128i s10 = _mm_load_si128((const __m128i*)(shuffle[2]));
    const __m128i s11 = _mm_load_si128((const __m128i*)(shuffle[3]));
    const __m128i s12 = _mm_load_si128((const __m128i*)(shuffle[4]));
    const __m128i s21 = _mm_load_si128((const __m128i*)(shuffle[5]));
    const __m128i s22 = _mm_load_si128((const __m128i*)(shuffle[6]));
    // adler32 state: a = low 16 bits, b = high 16 bits of idat->adler
    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    // crc32 state is kept inverted in idat (standard pre/post complement)
    uint32_t crc = ~idat->crc;
    // adler32
    // byte weights 16..1 for the per-register weighted sum
    const __m128i cmul = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m128i ones = _mm_set1_epi16(1);
    const __m128i zero = _mm_setzero_si128();
    // crc32
    // k1 = crc32_pow2mod(3*128+32)
    // k2 = crc32_pow2mod(3*128-32)
    const __m128i k1k2 = _mm_setr_epi32(0x3db1ecdc, 0, 0x74359406, 1);
    const __m128i k3k4 = _mm_setr_epi32(0x751997d0, 1, 0xccaa009e, 0);
    const __m128i k5k0 = _mm_setr_epi32(0x63cd6124, 1, 0x00000000, 0);
    const __m128i poly = _mm_setr_epi32(0xdb710641, 1, 0xf7011641, 0);
    const __m128i mask32 = _mm_setr_epi32(-1, 0, 0, 0);
    // multiplier 1 so the very first loop iteration does not fold;
    // switched to k1k2 (fold by 384 bits) after the first pass
    __m128i crc_mul = _mm_setr_epi32(1, 0, 0, 0);
    __m128i x0 = _mm_cvtsi32_si128(crc);
    __m128i x1 = zero;
    __m128i x2 = zero;
    while (size >= 48)
    {
        __m128i vs = zero;
        __m128i va = _mm_cvtsi32_si128(a);
        __m128i vb = _mm_cvtsi32_si128(b);
        // process as many 3x16-byte blocks as possible
        // UPNG__ADLER32_BLOCKS3 bounds the batch so the 32-bit lane sums
        // cannot overflow before the modulo below (presumably; confirm
        // against the constant's definition)
        size_t block_count = size / 48;
        block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3;
        for (size_t i = 0; i < block_count; i++)
        {
            // pixel filtering
            __m128i vlast0 = _mm_loadu_si128((const __m128i*)last + 0);
            __m128i vlast1 = _mm_loadu_si128((const __m128i*)last + 1);
            __m128i vlast2 = _mm_loadu_si128((const __m128i*)last + 2);
            __m128i vsrc0 = _mm_loadu_si128((const __m128i*)src + 0);
            __m128i vsrc1 = _mm_loadu_si128((const __m128i*)src + 1);
            __m128i vsrc2 = _mm_loadu_si128((const __m128i*)src + 2);
            __m128i v0 = _mm_sub_epi8(vsrc0, vlast0);
            __m128i v1 = _mm_sub_epi8(vsrc1, vlast1);
            __m128i v2 = _mm_sub_epi8(vsrc2, vlast2);
            __m128i vdst0 = _mm_or_si128(_mm_shuffle_epi8(v0, s00), _mm_shuffle_epi8(v1, s01));
            __m128i vdst1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v0, s10), _mm_shuffle_epi8(v1, s11)), _mm_shuffle_epi8(v2, s12));
            __m128i vdst2 = _mm_or_si128(_mm_shuffle_epi8(v1, s21), _mm_shuffle_epi8(v2, s22));
            _mm_storeu_si128((__m128i*)dst + 0, vdst0);
            _mm_storeu_si128((__m128i*)dst + 1, vdst1);
            _mm_storeu_si128((__m128i*)dst + 2, vdst2);
            last += inc;
            src += 48;
            dst += 48;
            size -= 48;
            // adler32 update
            // vs accumulates "a before each register" (scaled by 16 later),
            // va the plain byte sums, vb the 16..1 weighted sums
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst0, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst0, cmul), ones));
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst1, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst1, cmul), ones));
            vs = _mm_add_epi32(vs, va);
            va = _mm_add_epi32(va, _mm_sad_epu8(vdst2, zero));
            vb = _mm_add_epi32(vb, _mm_madd_epi16(_mm_maddubs_epi16(vdst2, cmul), ones));
            // crc32 update
            // fold each 128-bit lane forward by 384 bits, then mix in data
            x0 = _mm_xor_si128(_mm_clmulepi64_si128(x0, crc_mul, 0x00), _mm_clmulepi64_si128(x0, crc_mul, 0x11));
            x0 = _mm_xor_si128(x0, vdst0);
            x1 = _mm_xor_si128(_mm_clmulepi64_si128(x1, crc_mul, 0x00), _mm_clmulepi64_si128(x1, crc_mul, 0x11));
            x1 = _mm_xor_si128(x1, vdst1);
            x2 = _mm_xor_si128(_mm_clmulepi64_si128(x2, crc_mul, 0x00), _mm_clmulepi64_si128(x2, crc_mul, 0x11));
            x2 = _mm_xor_si128(x2, vdst2);
            crc_mul = k1k2;
        }
        // vb += vs * 16
        vb = _mm_add_epi32(vb, _mm_slli_epi32(vs, 4));
        // a = sum(va)
        va = _mm_hadd_epi32(va, va);
        va = _mm_hadd_epi32(va, va);
        a = _mm_cvtsi128_si32(va);
        // b = sum(vb)
        vb = _mm_hadd_epi32(vb, vb);
        vb = _mm_hadd_epi32(vb, vb);
        b = _mm_cvtsi128_si32(vb);
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }
    idat->adler = a | (b << 16);
    // reduce 384-bit to 128-bit
    x0 = _mm_xor_si128(x1, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
    x0 = _mm_xor_si128(x2, _mm_xor_si128(_mm_clmulepi64_si128(x0, k3k4, 0x00), _mm_clmulepi64_si128(x0, k3k4, 0x11)));
    // reduce 128-bit to 96-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, k3k4, 0x10));
    // reduce 96-bit to 64-bit
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), k5k0, 0x00));
    // reduce 64-bit to 32-bit
    // Barrett reduction with mu (poly high half) and the crc32 polynomial
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), poly, 0x10);
    x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), poly, 0x00);
    crc = _mm_extract_epi32(_mm_xor_si128(x0, x1), 1);
    idat->crc = ~crc;
    return dst - out;
}
// Undo the row filter for formats that need a single-register shuffle:
// reorder each 16 filtered bytes with the per-format pattern, then add
// back the reference row bytes (dst = shuffle(src) + last). Consumes only
// whole 16-byte groups and returns the number of bytes produced; the
// caller deals with the remainder. `last` advances by `inc` per group.
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__unrow1_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    // the pattern for the high 8 lanes is the low pattern offset by 8
    const __m128i pattern = _mm_set_epi64x(shuffle64 + 0x0808080808080808, shuffle64);
    size_t blocks = size / 16;
    for (size_t i = 0; i < blocks; i++)
    {
        __m128i delta = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(src + i * 16)), pattern);
        __m128i prev = _mm_loadu_si128((const __m128i*)(last + i * inc));
        _mm_storeu_si128((__m128i*)(dst + i * 16), _mm_add_epi8(delta, prev));
    }
    return blocks * 16;
}
// Undo the row filter for 3-register (48-byte) groups: rebuild the three
// output registers via the seven OR-combined shuffle patterns, then add
// back the reference row. Consumes only whole 48-byte groups and returns
// the number of bytes produced; the caller handles the tail.
static size_t UPNG__TARGET("ssse3,sse4.1")
upng__unrow3_sse4(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
    // p00/p01 assemble output reg 0, p10/p11/p12 reg 1, p21/p22 reg 2
    const __m128i p00 = _mm_load_si128((const __m128i*)(shuffle[0]));
    const __m128i p01 = _mm_load_si128((const __m128i*)(shuffle[1]));
    const __m128i p10 = _mm_load_si128((const __m128i*)(shuffle[2]));
    const __m128i p11 = _mm_load_si128((const __m128i*)(shuffle[3]));
    const __m128i p12 = _mm_load_si128((const __m128i*)(shuffle[4]));
    const __m128i p21 = _mm_load_si128((const __m128i*)(shuffle[5]));
    const __m128i p22 = _mm_load_si128((const __m128i*)(shuffle[6]));
    size_t blocks = size / 48;
    for (size_t i = 0; i < blocks; i++)
    {
        const __m128i* in = (const __m128i*)(src + i * 48);
        const __m128i* ref = (const __m128i*)(last + i * inc);
        __m128i* outp = (__m128i*)(dst + i * 48);
        __m128i in0 = _mm_loadu_si128(in + 0);
        __m128i in1 = _mm_loadu_si128(in + 1);
        __m128i in2 = _mm_loadu_si128(in + 2);
        __m128i r0 = _mm_or_si128(_mm_shuffle_epi8(in0, p00), _mm_shuffle_epi8(in1, p01));
        __m128i r1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(in0, p10), _mm_shuffle_epi8(in1, p11)), _mm_shuffle_epi8(in2, p12));
        __m128i r2 = _mm_or_si128(_mm_shuffle_epi8(in1, p21), _mm_shuffle_epi8(in2, p22));
        _mm_storeu_si128(outp + 0, _mm_add_epi8(r0, _mm_loadu_si128(ref + 0)));
        _mm_storeu_si128(outp + 1, _mm_add_epi8(r1, _mm_loadu_si128(ref + 1)));
        _mm_storeu_si128(outp + 2, _mm_add_epi8(r2, _mm_loadu_si128(ref + 2)));
    }
    return blocks * 48;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#if defined(UPNG__ARCH_ARM64)
// ARM64: filter one row (dst = shuffle(src - last), one NEON register per
// 16 bytes) while updating the adler32 and crc32 in idat. Three crc32
// strategies are selected at compile time:
//   UPNG__ARM64_CRYPTO - PMULL carry-less folding (4 lanes in the 64-byte loop)
//   UPNG__ARM64_CRC32  - hardware __crc32d, 8 bytes per instruction
//   otherwise          - table-driven crc32, 8 bytes per step
// A 64-byte (4-register) main loop is followed by a 16-byte loop for the
// remaining 1-3 registers. Only whole 16-byte groups are consumed; returns
// the number of bytes written to dst. `last` advances `inc` per 16 bytes.
static size_t upng__row1_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    if (size < 16)
    {
        return 0;
    }
    uint8_t* out = dst;
    // high 8 table indices are the low pattern offset by 8
    const uint64_t shuffle64_high = shuffle64 + 0x0808080808080808;
    const uint8x16_t shuffle = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64_high)));
    // adler32 state: a = low 16 bits, b = high 16 bits of idat->adler
    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    // crc32 state is kept inverted in idat (standard pre/post complement)
    uint32_t crc = ~idat->crc;
    // adler32 byte weights 16..1, packed little-endian as two 8-byte halves
    const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708));
    const uint32x4_t zero = vdupq_n_u32(0);
#if defined(UPNG__ARM64_CRYPTO)
    // crc32 fold-by-4 constants (512-bit distance) and Barrett reduction
    // terms; these match the published PCLMUL/PMULL crc32 constants
    const poly64x2_t k1k2 = { 0x154442bd4, 0x1c6e41596 };
    const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e };
    const poly64_t k5 = { 0x163cd6124 };
    const poly64_t poly_u = { 0x0f7011641 };
    const poly64_t poly_p = { 0x1db710641 };
    const uint64x2_t mask32 = { ~0U, 0 };
    // multiplier 1 so the first iteration does not fold; k1k2 afterwards
    poly64x2_t crc_mul = { 1, 0 };
    poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));
// carry-less "add" is xor of the 128-bit values
#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b)))
// x = clmul_fold(x, k) ^ value -- folds accumulator x by the distance
// encoded in k, then mixes in 16 new bytes
#define UPNG__CLMUL_P128(x,k,value) do { \
    poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \
    poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \
    x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \
} while (0)
#endif
    if (size >= 64)
    {
#if defined(UPNG__ARM64_CRYPTO)
        poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0)));
        poly128_t x2 = x1;
        poly128_t x3 = x1;
#endif
        while (size >= 64)
        {
            uint32x4_t va = vsetq_lane_u32(a, zero, 0);
            uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
            uint32x4_t vs = zero;
            // process as many 64-byte blocks as possible
            // batch is bounded so 32-bit adler lane sums cannot overflow
            // before the modulo below (presumably; confirm the constant)
            size_t block_count = size / 64;
            block_count = block_count < UPNG__ADLER32_BLOCKS4 ? block_count : UPNG__ADLER32_BLOCKS4;
            for (size_t i = 0; i < block_count; i++)
            {
                // pixel filtering
                uint8x16x4_t vlast = vld1q_u8_x4(last);
                uint8x16x4_t vsrc = vld1q_u8_x4(src);
                uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]);
                uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]);
                uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]);
                uint8x16_t v3 = vsubq_u8(vsrc.val[3], vlast.val[3]);
                uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
                uint8x16_t vdst1 = vqtbl1q_u8(v1, shuffle);
                uint8x16_t vdst2 = vqtbl1q_u8(v2, shuffle);
                uint8x16_t vdst3 = vqtbl1q_u8(v3, shuffle);
                uint8x16x4_t vdst = { vdst0, vdst1, vdst2, vdst3 };
                vst1q_u8_x4(dst, vdst);
                last += inc * 4;
                src += 64;
                dst += 64;
                size -= 64;
                // these could use vdotq_u32, but it runs ~2% slower
                // adler32: vs = running "a" sums, va = byte sums,
                // vb = 16..1 weighted sums (vb += vs*16 after the batch)
                uint16x8_t t0, t1, t2, t3;
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst0));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst1));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst2));
                vs = vaddq_u32(vs, va);
                va = vpadalq_u16(va, vpaddlq_u8(vdst3));
                t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
                t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul));
                t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul));
                t3 = vmull_u8(vget_low_u8(vdst3), vget_low_u8(cmul));
                t0 = vmlal_high_u8(t0, vdst0, cmul);
                t1 = vmlal_high_u8(t1, vdst1, cmul);
                t2 = vmlal_high_u8(t2, vdst2, cmul);
                t3 = vmlal_high_u8(t3, vdst3, cmul);
                vb = vpadalq_u16(vb, t0);
                vb = vpadalq_u16(vb, t1);
                vb = vpadalq_u16(vb, t2);
                vb = vpadalq_u16(vb, t3);
#if defined(UPNG__ARM64_CRYPTO)
                // fold each of the 4 lanes by 512 bits, mix in new data
                UPNG__CLMUL_P128(x0, crc_mul, vdst0);
                UPNG__CLMUL_P128(x1, crc_mul, vdst1);
                UPNG__CLMUL_P128(x2, crc_mul, vdst2);
                UPNG__CLMUL_P128(x3, crc_mul, vdst3);
                crc_mul = k1k2;
#elif defined(UPNG__ARM64_CRC32)
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 0));
                crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst3), 1));
#else
                // generic path: table-driven crc32, two 32-bit halves per
                // 8-byte step (8 table lookups)
                uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
                uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 0) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 1);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
                b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 2) ^ crc;
                b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst3), 3);
                crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
                crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
            }
            // vb += vs * 16, then reduce lanes and take the adler modulo
            vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
            a = vaddvq_u32(va);
            b = vaddvq_u32(vb);
            a %= UPNG__ADLER32_MOD;
            b %= UPNG__ADLER32_MOD;
        }
#if defined(UPNG__ARM64_CRYPTO)
        // reduce 512-bit to 128-bit
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2));
        UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x3));
        // subsequent 16-byte folds use the 128-bit fold distance
        crc_mul = k3k4;
#endif
    }
    if (size >= 16)
    {
        uint32x4_t va = vsetq_lane_u32(a, zero, 0);
        uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
        uint32x4_t vs = zero;
        // only 1 to 3 iterations
        while (size >= 16)
        {
            uint8x16_t v0 = vsubq_u8(vld1q_u8(src), vld1q_u8(last));
            uint8x16_t vdst0 = vqtbl1q_u8(v0, shuffle);
            vst1q_u8(dst, vdst0);
            last += inc;
            src += 16;
            dst += 16;
            size -= 16;
            uint16x8_t t0;
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst0));
            t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
            t0 = vmlal_high_u8(t0, vdst0, cmul);
            vb = vpadalq_u16(vb, t0);
#if defined(UPNG__ARM64_CRYPTO)
            UPNG__CLMUL_P128(x0, crc_mul, vdst0);
            crc_mul = k3k4;
#elif defined(UPNG__ARM64_CRC32)
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
#else
            uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
            uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
        }
        vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
        a = vaddvq_u32(va);
        b = vaddvq_u32(vb);
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }
#if defined(UPNG__ARM64_CRYPTO)
    // reduce 128-bit to 96-bit
    poly128_t p0;
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1)));
    // reduce 96-bit to 64-bit
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5));
    // reduce 64-bit to 32-bit
    // Barrett reduction with mu (poly_u) and the crc32 polynomial (poly_p)
    poly128_t x1;
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u);
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p);
    crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1);
#undef UPNG__CLADD_P128
#undef UPNG__CLMUL_P128
#endif
    idat->adler = a | (b << 16);
    idat->crc = ~crc;
    return dst - out;
}
// ARM64: filter one row in 48-byte groups (three NEON registers) for
// formats whose bytes must be reassembled across register boundaries
// (see the 7-pattern shuffle tables). Computes dst = shuffle(src - last)
// and folds the produced bytes into the adler32 and crc32 in idat.
// crc32 strategy is chosen at compile time: PMULL folding over three
// 128-bit lanes (UPNG__ARM64_CRYPTO), hardware __crc32d
// (UPNG__ARM64_CRC32), or table lookups otherwise. Only whole 48-byte
// groups are consumed; returns bytes written to dst (caller handles the
// tail). `last` advances by `inc` bytes per group.
static size_t upng__row3_arm64(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
    if (size < 48)
    {
        return 0;
    }
    uint8_t* out = dst;
    // s00/s01 assemble output reg 0, s10/s11/s12 reg 1, s21/s22 reg 2
    const uint8x16_t s00 = vld1q_u8(shuffle[0]);
    const uint8x16_t s01 = vld1q_u8(shuffle[1]);
    const uint8x16_t s10 = vld1q_u8(shuffle[2]);
    const uint8x16_t s11 = vld1q_u8(shuffle[3]);
    const uint8x16_t s12 = vld1q_u8(shuffle[4]);
    const uint8x16_t s21 = vld1q_u8(shuffle[5]);
    const uint8x16_t s22 = vld1q_u8(shuffle[6]);
    // adler32 state: a = low 16 bits, b = high 16 bits of idat->adler
    uint32_t a = idat->adler & 0xffff;
    uint32_t b = idat->adler >> 16;
    // crc32 state is kept inverted in idat (standard pre/post complement)
    uint32_t crc = ~idat->crc;
    // adler32 byte weights 16..1, packed little-endian as two 8-byte halves
    const uint8x16_t cmul = vcombine_u8(vcreate_u8(0x090a0b0c0d0e0f10), vcreate_u8(0x0102030405060708));
    const uint32x4_t zero = vdupq_n_u32(0);
#if defined(UPNG__ARM64_CRYPTO)
    // fold-by-3 constants: k1 = crc32_pow2mod(3*128+32),
    // k2 = crc32_pow2mod(3*128-32); k3k4/k5/poly_* for final reduction
    const poly64x2_t k1k2 = { 0x03db1ecdc, 0x174359406 };
    const poly64x2_t k3k4 = { 0x1751997d0, 0x0ccaa009e };
    const poly64_t k5 = { 0x163cd6124 };
    const poly64_t poly_u = { 0x0f7011641 };
    const poly64_t poly_p = { 0x1db710641 };
    const uint64x2_t mask32 = { ~0U, 0 };
    // multiplier 1 so the first iteration does not fold; k1k2 afterwards
    poly64x2_t crc_mul = { 1, 0 };
    poly128_t x0 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(crc), vcreate_u64(0)));
    poly128_t x1 = vreinterpretq_p128_u64(vcombine_u64(vcreate_u64(0), vcreate_u64(0)));
    poly128_t x2 = x1;
// carry-less "add" is xor of the 128-bit values
#define UPNG__CLADD_P128(a, b) vreinterpretq_p128_u8(veorq_u8(vreinterpretq_u8_p128(a), vreinterpretq_u8_p128(b)))
// x = clmul_fold(x, k) ^ value -- folds accumulator x by the distance
// encoded in k, then mixes in 16 new bytes
#define UPNG__CLMUL_P128(x,k,value) do { \
    poly128_t p0 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x), 0), vgetq_lane_p64(k, 0)); \
    poly128_t p1 = vmull_high_p64(vreinterpretq_p64_p128(x), k); \
    x = UPNG__CLADD_P128(UPNG__CLADD_P128(p0, p1), vreinterpretq_p128_u8(value)); \
} while (0)
#endif
    while (size >= 48)
    {
        uint32x4_t va = vsetq_lane_u32(a, zero, 0);
        uint32x4_t vb = vsetq_lane_u32(b, zero, 0);
        uint32x4_t vs = zero;
        // process as many 3x16-byte blocks as possible
        // batch is bounded so 32-bit adler lane sums cannot overflow before
        // the modulo below (presumably; confirm the constant's definition)
        size_t block_count = size / 48;
        block_count = block_count < UPNG__ADLER32_BLOCKS3 ? block_count : UPNG__ADLER32_BLOCKS3;
        for (size_t i = 0; i < block_count; i++)
        {
            // pixel filtering: delta against `last`, then reassemble bytes
            uint8x16x3_t vlast = vld1q_u8_x3(last);
            uint8x16x3_t vsrc = vld1q_u8_x3(src);
            uint8x16_t v0 = vsubq_u8(vsrc.val[0], vlast.val[0]);
            uint8x16_t v1 = vsubq_u8(vsrc.val[1], vlast.val[1]);
            uint8x16_t v2 = vsubq_u8(vsrc.val[2], vlast.val[2]);
            uint8x16_t vdst0 = vqtbx1q_u8(vqtbl1q_u8(v0, s00), v1, s01);
            uint8x16_t vdst1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(v0, s10), v1, s11), v2, s12);
            uint8x16_t vdst2 = vqtbx1q_u8(vqtbl1q_u8(v1, s21), v2, s22);
            uint8x16x3_t vdst = { vdst0, vdst1, vdst2 };
            vst1q_u8_x3(dst, vdst);
            last += inc;
            src += 48;
            dst += 48;
            size -= 48;
            // adler32: vs = running "a" sums, va = byte sums,
            // vb = 16..1 weighted sums (vb += vs*16 after the batch)
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst0));
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst1));
            vs = vaddq_u32(vs, va);
            va = vpadalq_u16(va, vpaddlq_u8(vdst2));
            uint16x8_t t0, t1, t2;
            t0 = vmull_u8(vget_low_u8(vdst0), vget_low_u8(cmul));
            t1 = vmull_u8(vget_low_u8(vdst1), vget_low_u8(cmul));
            t2 = vmull_u8(vget_low_u8(vdst2), vget_low_u8(cmul));
            t0 = vmlal_high_u8(t0, vdst0, cmul);
            t1 = vmlal_high_u8(t1, vdst1, cmul);
            t2 = vmlal_high_u8(t2, vdst2, cmul);
            vb = vpadalq_u16(vb, t0);
            vb = vpadalq_u16(vb, t1);
            vb = vpadalq_u16(vb, t2);
#if defined(UPNG__ARM64_CRYPTO)
            // fold each of the 3 lanes by 384 bits, mix in new data
            UPNG__CLMUL_P128(x0, crc_mul, vdst0);
            UPNG__CLMUL_P128(x1, crc_mul, vdst1);
            UPNG__CLMUL_P128(x2, crc_mul, vdst2);
            crc_mul = k1k2;
#elif defined(UPNG__ARM64_CRC32)
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst0), 1));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst1), 1));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 0));
            crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(vdst2), 1));
#else
            // generic path: table-driven crc32, two 32-bit halves per
            // 8-byte step (8 table lookups)
            uint32_t b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 0) ^ crc;
            uint32_t b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst0), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 0) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst1), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 0) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 1);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
            b0 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 2) ^ crc;
            b1 = vgetq_lane_u32(vreinterpretq_u32_u8(vdst2), 3);
            crc = upng__crc32_table[0][(b1 >> 24) & 0xff] ^ upng__crc32_table[1][(b1 >> 16) & 0xff] ^ upng__crc32_table[2][(b1 >> 8) & 0xff] ^ upng__crc32_table[3][b1 & 0xff];
            crc ^= upng__crc32_table[4][(b0 >> 24) & 0xff] ^ upng__crc32_table[5][(b0 >> 16) & 0xff] ^ upng__crc32_table[6][(b0 >> 8) & 0xff] ^ upng__crc32_table[7][b0 & 0xff];
#endif
        }
        // vb += vs * 16, then reduce lanes and take the adler modulo
        vb = vaddq_u32(vb, vshlq_n_u32(vs, 4));
        a = vaddvq_u32(va);
        b = vaddvq_u32(vb);
        a %= UPNG__ADLER32_MOD;
        b %= UPNG__ADLER32_MOD;
    }
#if defined(UPNG__ARM64_CRYPTO)
    // reduce 384-bit to 128-bit
    UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x1));
    UPNG__CLMUL_P128(x0, k3k4, vreinterpretq_u8_p128(x2));
    // reduce 128-bit to 96-bit
    poly128_t p0;
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 8));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_p128(x0), 0), vgetq_lane_p64(k3k4, 1)));
    // reduce 96-bit to 64-bit
    p0 = vreinterpretq_p128_u8(vextq_u8(vreinterpretq_u8_p128(x0), vdupq_n_u8(0), 4));
    x0 = UPNG__CLADD_P128(p0, vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), k5));
    // reduce 64-bit to 32-bit
    // Barrett reduction with mu (poly_u) and the crc32 polynomial (poly_p)
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x0), mask32)), 0), poly_u);
    x1 = vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u64(vandq_u64(vreinterpretq_u64_p128(x1), mask32)), 0), poly_p);
    crc = vgetq_lane_u32(vreinterpretq_u32_p128(UPNG__CLADD_P128(x0, x1)), 1);
#undef UPNG__CLMUL_P128
#undef UPNG__CLADD_P128
#endif
    idat->adler = a | (b << 16);
    idat->crc = ~crc;
    return dst - out;
}
// Undo the row filter for single-register formats: reorder each 16
// filtered bytes with the per-format tbl pattern and add back the
// reference row (dst = shuffle(src) + last). Consumes whole 16-byte
// groups only and returns the number of bytes produced.
static size_t upng__unrow1_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, uint64_t shuffle64)
{
    // high 8 table indices are the low pattern offset by 8
    const uint8x16_t pattern = vreinterpretq_u8_u64(vcombine_u64(vdup_n_u64(shuffle64), vdup_n_u64(shuffle64 + 0x0808080808080808)));
    size_t blocks = size / 16;
    for (size_t i = 0; i < blocks; i++)
    {
        uint8x16_t delta = vqtbl1q_u8(vld1q_u8(src + i * 16), pattern);
        vst1q_u8(dst + i * 16, vaddq_u8(delta, vld1q_u8(last + i * inc)));
    }
    return blocks * 16;
}
// Undo the row filter for 48-byte (3-register) groups: rebuild the byte
// order with the seven tbl/tbx patterns, then add back the reference row.
// Consumes whole 48-byte groups only and returns the bytes produced.
static size_t upng__unrow3_arm64(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, const uint8_t(*shuffle)[16])
{
    // p00/p01 assemble output reg 0, p10/p11/p12 reg 1, p21/p22 reg 2
    const uint8x16_t p00 = vld1q_u8(shuffle[0]);
    const uint8x16_t p01 = vld1q_u8(shuffle[1]);
    const uint8x16_t p10 = vld1q_u8(shuffle[2]);
    const uint8x16_t p11 = vld1q_u8(shuffle[3]);
    const uint8x16_t p12 = vld1q_u8(shuffle[4]);
    const uint8x16_t p21 = vld1q_u8(shuffle[5]);
    const uint8x16_t p22 = vld1q_u8(shuffle[6]);
    size_t blocks = size / 48;
    for (size_t i = 0; i < blocks; i++)
    {
        uint8x16x3_t in = vld1q_u8_x3(src + i * 48);
        uint8x16_t r0 = vqtbx1q_u8(vqtbl1q_u8(in.val[0], p00), in.val[1], p01);
        uint8x16_t r1 = vqtbx1q_u8(vqtbx1q_u8(vqtbl1q_u8(in.val[0], p10), in.val[1], p11), in.val[2], p12);
        uint8x16_t r2 = vqtbx1q_u8(vqtbl1q_u8(in.val[1], p21), in.val[2], p22);
        uint8x16x3_t ref = vld1q_u8_x3(last + i * inc);
        uint8x16x3_t result = { vaddq_u8(r0, ref.val[0]), vaddq_u8(r1, ref.val[1]), vaddq_u8(r2, ref.val[2]) };
        vst1q_u8_x3(dst + i * 48, result);
    }
    return blocks * 48;
}
#endif
// `_` / `__` mark "don't care" lanes (index 0xff): SSE pshufb writes zero
// for an index with the high bit set and the results are OR'd together,
// while NEON tbx leaves the destination byte untouched -- either way the
// byte comes from one of the other patterns in the group.
#define _ 0xff
#define __ _
// 7 patterns per format: rows [0]/[1] combine into output register 0,
// rows [2]/[3]/[4] into register 1, rows [5]/[6] into register 2
// (matches the s00..s22 loads in the row/unrow functions).
// identity shuffle, nothing to rearrange
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB8[7][16]) =
{
    { 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { _,_,_, _,_,_, _,_,_, _,__,__, __,__,__, __},
    { 0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14, 15},
};
// BGR8 -> RGB8: swaps the B and R bytes of each 3-byte pixel; pixels
// straddle the 16-byte register boundaries, hence the cross-register
// patterns (trailing ASCII diagrams show each output register's layout).
// 0123456789012345
// src0 = [BGRBGRBGRBGRBGRB]
// src1 = [GRBGRBGRBGRBGRBG]
// src2 = [RBGRBGRBGRBGRBGR]
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR8[7][16]) =
{
    { 2,1,0, 5,4,3, 8,7,6, 11,10,9, 14,13,12, _}, // RGB RGB RGB RGB RGB _
    { _,_,_, _,_,_, _,_,_, __,__,_, __,__,__, 1}, // ___ ___ ___ ___ ___ R
    { _,15, _,_,_, _,_,_, __,_,_, __,__,__, _,__}, // _B ___ ___ ___ ___ __
    { 0,__, 4,3,2, 7,6,5, 10,9,8, 13,12,11, _,15}, // G_ RGB RGB RGB RGB _G
    { _,__, _,_,_, _,_,_, __,_,_, __,__,__, 0,__}, // __ ___ ___ ___ ___ R_
    {14, _,_,_, _,_,_, _,_,_, __,__,__, __,__,__}, // B ___ ___ ___ ___ ___
    {__, 3,2,1, 6,5,4, 9,8,7, 12,11,10, 15,14,13}, // _ RGB RGB RGB RGB RGB
};
// RGB16 -> big-endian PNG samples: PNG stores 16-bit channels big-endian,
// so each native little-endian 16-bit value has its two bytes swapped.
// only swap bytes in each 16-bit value
static const uint8_t UPNG__ALIGN(16, upng__shuffle_RGB16[7][16]) =
{
    {1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {_,_, _,_, _,_, _,_, _,_, __,__, __,__, __,__},
    {1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14},
};
// 0123456789012345
// src0 = [BBGGRRBBGGRRBBGG]
// src1 = [RRBBGGRRBBGGRRBB]
// src2 = [GGRRBBGGRRBBGGRR]
static const uint8_t UPNG__ALIGN(16, upng__shuffle_BGR16[7][16]) =
{
{5,4,3,2,1,0, 11,10,9,8,7,6, _,_,15,14}, // RRGGBB RRGGBB __GG
{_,_,_,_,_,_, __,__,_,_,_,_, 1,0,__,__}, // ______ ______ RR__
{13,12, _,_,_,_,_,_, __,__,__,__,_,_, _,_}, // BB ______ ______ __
{__,__, 7,6,5,4,3,2, 13,12,11,10,9,8, _,_}, // __ RRGGBB RRGGBB __
{__,__, _,_,_,_,_,_, __,__,__,__,_,_, 3,2}, // __ ______ ______ RR
{_,_, 15,14, _,_,_,_,_,_, __,__,__,__,__,__}, // __BB ______ ______
{1,0, __,__, 9,8,7,6,5,4, 15,14,13,12,11,10}, // GG__ RRGGBB RRGGBB
};
#undef _
#undef __
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16
// Selects the 8-byte lane permutation that converts one pixel group to PNG
// byte order (RGBA channel order, big-endian samples) and dispatches to the
// best available SIMD kernel. Returns the number of input bytes consumed by
// the kernel (0 when no kernel is available for this target).
static size_t upng__row1(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    uint64_t shuffle64 = 0;
    if (format == UPNG_FORMAT_G8 || format == UPNG_FORMAT_GA8 || format == UPNG_FORMAT_RGBA8)
    {
        shuffle64 = 0x0706050403020100; // nothing to shuffle, identity
    }
    else if (format == UPNG_FORMAT_BGRA8)
    {
        shuffle64 = 0x0704050603000102; // BGRA to RGBA
    }
    else if (format == UPNG_FORMAT_G16 || format == UPNG_FORMAT_GA16 || format == UPNG_FORMAT_RGBA16)
    {
        shuffle64 = 0x0607040502030001; // swap bytes in each 16-bit sample
    }
    else if (format == UPNG_FORMAT_BGRA16)
    {
        shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA
    }
#if defined(UPNG__ARCH_X64_AVX2)
    // AVX2 build: kernel choice depends on carry-less multiply (crc) support
    return (upng__cpuid() & UPNG__CPUID_CLMUL)
        ? upng__row1_avx2(idat, dst, src, last, size, inc, shuffle64)
        : upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64);
#elif defined(UPNG__ARCH_X64)
    int cpuid = upng__cpuid();
    if (cpuid & UPNG__CPUID_CLMUL)
    {
        return upng__row1_clmul(idat, dst, src, last, size, inc, shuffle64);
    }
    if (cpuid & UPNG__CPUID_SSE41)
    {
        return upng__row1_sse4(idat, dst, src, last, size, inc, shuffle64);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__row1_arm64(idat, dst, src, last, size, inc, shuffle64);
#else
    (void)shuffle64;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// handles RGB8, BGR8, RGB16, BGR16
// Picks the 7-entry shuffle table for 3-byte/6-byte pixels and dispatches to
// the best available SIMD kernel. Returns the number of input bytes consumed
// (0 when no kernel is available for this target).
static size_t upng__row3(upng__idat* idat, uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    const uint8_t (*shuffle)[16] =
        format == UPNG_FORMAT_RGB8  ? upng__shuffle_RGB8  :
        format == UPNG_FORMAT_BGR8  ? upng__shuffle_BGR8  :
        format == UPNG_FORMAT_RGB16 ? upng__shuffle_RGB16 :
        format == UPNG_FORMAT_BGR16 ? upng__shuffle_BGR16 :
        NULL;
#if defined(UPNG__ARCH_X64)
    int cpuid = upng__cpuid();
    if (cpuid & UPNG__CPUID_CLMUL)
    {
        return upng__row3_clmul(idat, dst, src, last, size, inc, shuffle);
    }
    if (cpuid & UPNG__CPUID_SSE41)
    {
        return upng__row3_sse4(idat, dst, src, last, size, inc, shuffle);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__row3_arm64(idat, dst, src, last, size, inc, shuffle);
#else
    (void)shuffle;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// Encodes one image row into PNG scanline bytes (excluding the leading filter
// byte): converts the pixel format to PNG channel order with big-endian
// samples, applies the UP filter (output = current byte - same byte of the
// previous row), and folds the produced bytes into the running adler32/crc32.
// - src: current row in the user's format; pitch: bytes between user rows
// - dst: destination for filtered PNG bytes; size: row length in bytes
// The bulk is done by the SIMD helpers in whole 16/48-byte groups; the switch
// below handles the remaining tail pixels one at a time.
static void upng__row(upng__idat* idat, uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter)
{
// NONE filter is same as UP when previous row is all 0 values
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 };
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : src - pitch;
size_t inc;
size_t used;
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16)
{
// 3-channel formats: SIMD consumes 48 input bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 48;
used = upng__row3(idat, dst, src, last, size, inc, format);
}
else
{
// 1/2/4-channel formats: SIMD consumes 16 input bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 16;
used = upng__row1(idat, dst, src, last, size, inc, format);
}
// advance "last" only when UP filtering (inc != 0); for NONE it stays on the
// zero buffer so the subtraction below is a no-op
last += inc == 0 ? 0 : used;
src += used;
dst += used;
size -= used;
// remember the scalar tail so its bytes can be checksummed at the end
// NOTE(review): the SIMD kernels are expected to have already folded the
// "used" bytes into idat->adler/idat->crc — confirm in the arch kernels
uint8_t* tail = dst;
size_t tail_size = size;
// size < inc
// per-format scalar tail; "inc" is rescaled from bytes-per-SIMD-group to
// bytes-per-pixel so "last" steps one source pixel at a time
switch (format)
{
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes
inc /= 16; // 0 or 1
while (size != 0)
{
dst[0] = src[0] - last[0];
last += inc;
src += 1;
dst += 1;
size -= 1;
}
break;
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
dst[0] = src[0] - last[0];
dst[1] = src[1] - last[1];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
dst[0] = src[0] - last[0];
dst[1] = src[1] - last[1];
dst[2] = src[2] - last[2];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
// output in RGB order; "last" indexes the source (BGR) layout
dst[0] = src[2] - last[2];
dst[1] = src[1] - last[1];
dst[2] = src[0] - last[0];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[0] - last[0];
dst[1] = src[1] - last[1];
dst[2] = src[2] - last[2];
dst[3] = src[3] - last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[2] - last[2];
dst[1] = src[1] - last[1];
dst[2] = src[0] - last[0];
dst[3] = src[3] - last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
// byte-swap each little-endian 16-bit sample to PNG big-endian
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[5] - last[5];
dst[5] = src[4] - last[4];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
// reverse channel order and byte-swap in one pass
dst[0] = src[5] - last[5];
dst[1] = src[4] - last[4];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[1] - last[1];
dst[5] = src[0] - last[0];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[1] - last[1];
dst[1] = src[0] - last[0];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[5] - last[5];
dst[5] = src[4] - last[4];
dst[6] = src[7] - last[7];
dst[7] = src[6] - last[6];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[5] - last[5];
dst[1] = src[4] - last[4];
dst[2] = src[3] - last[3];
dst[3] = src[2] - last[2];
dst[4] = src[1] - last[1];
dst[5] = src[0] - last[0];
dst[6] = src[7] - last[7];
dst[7] = src[6] - last[6];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
}
// fold the scalar-tail bytes into the running zlib adler32 and chunk crc32
idat->adler = upng__adler32(idat->adler, tail, tail_size);
idat->crc = upng__crc32(idat->crc, tail, tail_size);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// handles G8, GA8, RGBA8, BGRA8, G16, GA16, BGRA16, RGBA16
// Selects the 8-byte lane permutation that converts PNG byte order back to
// the user's pixel format and dispatches to the best available SIMD kernel.
// Returns the number of bytes consumed (0 when no kernel is available).
static size_t upng__unrow1(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    uint64_t shuffle64 = 0;
    if (format == UPNG_FORMAT_G8 || format == UPNG_FORMAT_GA8 || format == UPNG_FORMAT_RGBA8)
    {
        shuffle64 = 0x0706050403020100; // nothing to shuffle, identity
    }
    else if (format == UPNG_FORMAT_BGRA8)
    {
        shuffle64 = 0x0704050603000102; // BGRA to RGBA
    }
    else if (format == UPNG_FORMAT_G16 || format == UPNG_FORMAT_GA16 || format == UPNG_FORMAT_RGBA16)
    {
        shuffle64 = 0x0607040502030001; // swap bytes in each 16-bit sample
    }
    else if (format == UPNG_FORMAT_BGRA16)
    {
        shuffle64 = 0x0607000102030405; // swap bytes, BGRA to RGBA
    }
#if defined(UPNG__ARCH_X64_AVX2)
    // AVX2 build implies SSE4.1 is available, no runtime check needed
    return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64);
#elif defined(UPNG__ARCH_X64)
    if (upng__cpuid() & UPNG__CPUID_SSE41)
    {
        return upng__unrow1_sse4(dst, src, last, size, inc, shuffle64);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__unrow1_arm64(dst, src, last, size, inc, shuffle64);
#else
    (void)shuffle64;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// handles RGB8, BGR8, RGB16, BGR16
// Picks the 7-entry shuffle table for 3-byte/6-byte pixels and dispatches to
// the best available SIMD kernel. Returns the number of bytes consumed
// (0 when no kernel is available for this target).
static size_t upng__unrow3(uint8_t* dst, const uint8_t* src, const uint8_t* last, size_t size, size_t inc, upng_format format)
{
    const uint8_t (*shuffle)[16] =
        format == UPNG_FORMAT_RGB8  ? upng__shuffle_RGB8  :
        format == UPNG_FORMAT_BGR8  ? upng__shuffle_BGR8  :
        format == UPNG_FORMAT_RGB16 ? upng__shuffle_RGB16 :
        format == UPNG_FORMAT_BGR16 ? upng__shuffle_BGR16 :
        NULL;
#if defined(UPNG__ARCH_X64)
    if (upng__cpuid() & UPNG__CPUID_SSE41)
    {
        return upng__unrow3_sse4(dst, src, last, size, inc, shuffle);
    }
    return 0;
#elif defined(UPNG__ARCH_ARM64)
    return upng__unrow3_arm64(dst, src, last, size, inc, shuffle);
#else
    (void)shuffle;
    return 0; // no SIMD kernel, caller falls back to scalar code
#endif
}
// Decodes one PNG scanline (without its leading filter byte) into the user's
// pixel format: undoes the UP filter (output = PNG byte + same byte of the
// previous decoded row) and converts channel order / sample endianness.
// - src: filtered PNG bytes for this row; size: row length in bytes
// - dst: destination row in the user's format; pitch: bytes between dst rows
// "last" points into the already-decoded destination image, so it is indexed
// with destination-layout offsets (unlike upng__row, which indexes the source
// layout). The bulk is done by the SIMD helpers; the switch handles the tail.
static void upng__unrow(uint8_t* dst, const uint8_t* src, size_t pitch, size_t size, upng_format format, upng_filter filter)
{
// NONE filter is same as UP when previous row is all 0 values
static const uint8_t UPNG__ALIGN(64, zero[64]) = { 0 };
const uint8_t* last = filter == UPNG_FILTER_NONE ? zero : dst - pitch;
size_t inc;
size_t used;
if (format == UPNG_FORMAT_RGB8 || format == UPNG_FORMAT_BGR8 || format == UPNG_FORMAT_RGB16 || format == UPNG_FORMAT_BGR16)
{
// 3-channel formats: SIMD consumes 48 bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 48;
used = upng__unrow3(dst, src, last, size, inc, format);
}
else
{
// 1/2/4-channel formats: SIMD consumes 16 bytes per iteration
inc = filter == UPNG_FILTER_NONE ? 0 : 16;
used = upng__unrow1(dst, src, last, size, inc, format);
}
// advance "last" only when UP filtering; for NONE it stays on the zero buffer
last += inc == 0 ? 0 : used;
src += used;
dst += used;
size -= used;
// size < inc
// per-format scalar tail; "inc" is rescaled from bytes-per-SIMD-group to
// bytes-per-pixel so "last" steps one destination pixel at a time
switch (format)
{
case UPNG_FORMAT_G8: // 16 pixels per 16 bytes
inc /= 16; // 0 or 1
while (size != 0)
{
dst[0] = src[0] + last[0];
last += inc;
src += 1;
dst += 1;
size -= 1;
}
break;
case UPNG_FORMAT_GA8: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
dst[0] = src[0] + last[0];
dst[1] = src[1] + last[1];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_RGB8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
dst[0] = src[0] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[2] + last[2];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_BGR8: // 16 pixels per 48 bytes
inc /= 16; // 0 or 3
while (size != 0)
{
// src is RGB (PNG order); "last" indexes the destination (BGR) layout
dst[0] = src[2] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[0] + last[2];
last += inc;
src += 3;
dst += 3;
size -= 3;
}
break;
case UPNG_FORMAT_RGBA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[0] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[2] + last[2];
dst[3] = src[3] + last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_BGRA8: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[2] + last[0];
dst[1] = src[1] + last[1];
dst[2] = src[0] + last[2];
dst[3] = src[3] + last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_G16: // 8 pixels per 16 bytes
inc /= 8; // 0 or 2
while (size != 0)
{
// byte-swap each big-endian PNG sample to little-endian output
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
last += inc;
src += 2;
dst += 2;
size -= 2;
}
break;
case UPNG_FORMAT_GA16: // 4 pixels per 16 bytes
inc /= 4; // 0 or 4
while (size != 0)
{
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
last += inc;
src += 4;
dst += 4;
size -= 4;
}
break;
case UPNG_FORMAT_RGB16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[5] + last[4];
dst[5] = src[4] + last[5];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_BGR16: // 8 pixels per 48 bytes
inc /= 8; // 0 or 6
while (size != 0)
{
// reverse channel order and byte-swap in one pass
dst[0] = src[5] + last[0];
dst[1] = src[4] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[1] + last[4];
dst[5] = src[0] + last[5];
last += inc;
src += 6;
dst += 6;
size -= 6;
}
break;
case UPNG_FORMAT_RGBA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[1] + last[0];
dst[1] = src[0] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[5] + last[4];
dst[5] = src[4] + last[5];
dst[6] = src[7] + last[6];
dst[7] = src[6] + last[7];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
case UPNG_FORMAT_BGRA16: // 2 pixels per 16 bytes
inc /= 2; // 0 or 8
while (size != 0)
{
dst[0] = src[5] + last[0];
dst[1] = src[4] + last[1];
dst[2] = src[3] + last[2];
dst[3] = src[2] + last[3];
dst[4] = src[1] + last[4];
dst[5] = src[0] + last[5];
dst[6] = src[7] + last[6];
dst[7] = src[6] + last[7];
last += inc;
src += 8;
dst += 8;
size -= 8;
}
break;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// 8-byte PNG file signature, https://www.w3.org/TR/png/#5PNG-file-signature
static const char upng__sig[] = "\x89PNG\r\n\x1a\n";
// IHDR payload size: width(4) + height(4) + bit depth(1) + color type(1) +
// compression(1) + filter(1) + interlace(1) = 13 bytes
static const size_t upng__ihdr_size = 13;
// max chunk size
// (PNG chunk length field must not exceed 2^31 - 1)
static const size_t upng__max_chunk_size = (1U << 31) - 1;
// bytes per pixel for each format, indexed by upng_format enum value
static const uint32_t upng__bpp[] =
{
1, // G8
2, // GA8
3, // RGB8
3, // BGR8
4, // RGBA8
4, // BGRA8
2, // G16
4, // GA16
6, // RGB16
6, // BGR16
8, // RGBA16
8, // BGRA16
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
size_t upng_write(void* dst, const void* src, uint32_t width, uint32_t height, size_t pitch, upng_format format, upng_filter filter)
{
if (width == 0 || height == 0)
{
// bad width or height
return 0;
}
const uint32_t bpp = upng__bpp[format];
if (pitch == 0)
{
pitch = width * bpp;
}
if ((size_t)width * height < (size_t)width || (size_t)width * height >= (size_t)1 << 48)
{
// width and height too big
return 0;
}
if ((size_t)pitch * bpp < (size_t)pitch)
{
// pitch too large, overflows size_t
return 0;
}
static const char iend_chunk[] = "\0\0\0\0IEND\xae\x42\x60\x82";
// max zlib block size
const uint32_t max_block_size = 65535;
// how many pixels fit into one zlib block (conservative estimate, because of filter byte)
const uint32_t pixels_per_block = (max_block_size - 1) / bpp;
// how many full zlib blocks needed per row
size_t full_block_count = width / pixels_per_block;
// how many pixels are left
size_t tail_block_pixels = width % pixels_per_block;
// how many bytes in full zlib blocks
size_t full_block_size = full_block_count * (1 + 4 + pixels_per_block * bpp);
// how many bytes in last zlib block
size_t last_block_size = tail_block_pixels ? (1 + 4 + tail_block_pixels * bpp) : 0;
// total size per row including filter byte
size_t size_per_row = 1 + full_block_size + last_block_size;
// how many rows fit into IDAT chunk
size_t rows_per_idat = upng__max_chunk_size / size_per_row;
if (rows_per_idat == 0)
{
// code assumes it can fit at least one full row (including zlib block headers) into IDAT
return 0;
}
if (!dst)
{
size_t size = 0;
// png signature
size += sizeof(upng__sig) - 1;
// IHDR chunk
size += 4 + 4 + upng__ihdr_size + 4;
// first IDAT chunk contains 2 zlib header bytes
size += 4 + 4 + 2 + 4;
// how many full IDAT chunks
size_t full_idat_count = height / rows_per_idat;
// how many rows in last IDAT
size_t tail_idat_rows = height % rows_per_idat;
size += (4 + 4 + rows_per_idat * size_per_row + 4) * full_idat_count;
size += tail_idat_rows ? (4 + 4 + tail_idat_rows * size_per_row + 4) : 0;
// last IDAT chunk with empty zlib block & adler32
size += 4 + 4 + (1 + 4) + (4) + 4;
// IEND chunk
size += sizeof(iend_chunk) - 1;
return size;
}
upng__crc32_init();
uint8_t* out = (uint8_t*)dst;
// file signature, https://www.w3.org/TR/png/#5PNG-file-signature
for (size_t i = 0; i < sizeof(upng__sig) - 1; i++)
{
*out++ = (uint8_t)upng__sig[i];
}