// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef FJXL_SELF_INCLUDE
#define FJXL_SELF_INCLUDE
#include "enc_fast_lossless.h"
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <array>
#include <limits>
#include <memory>
#include <vector>
namespace {
/// Determine whether the target CPU is little-endian.
#ifdef __BYTE_ORDER__
#define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#else
#define CPU_IS_LITTLE_ENDIAN() 1
#endif
#define FJXL_INLINE
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
FJXL_INLINE uint32_t ff_log(uint32_t v)
{
unsigned long index;
// OR with 1 so the intrinsic is never called with 0, matching the
// __builtin_clz path below; _BitScanReverse on 0 is undefined.
_BitScanReverse(&index, v | 1);
return index;
}
// Compiles to a memcpy on little-endian systems.
FJXL_INLINE void StoreLe64(uint8_t *tgt, uint64_t data)
{
if (CPU_IS_LITTLE_ENDIAN())
memcpy(tgt, &data, 8);
else {
// swap and write
uint64_t val = _byteswap_uint64(data);
memcpy(tgt, &val, 8);
}
}
#else
FJXL_INLINE uint32_t ff_log(uint32_t v)
{
return 31 - __builtin_clz(v|1);
}
// Compiles to a memcpy on little-endian systems.
FJXL_INLINE void StoreLe64(uint8_t *tgt, uint64_t data)
{
if (CPU_IS_LITTLE_ENDIAN())
memcpy(tgt, &data, 8);
else {
// swap and write
uint64_t val = __builtin_bswap64(data);
memcpy(tgt, &val, 8);
}
}
#endif
// Add nbits bits to the bit buffer and flush complete bytes to the output.
// Returns the number of whole bytes written to data_buf.
//
// Parameters:
// nbits: number of bits to add (callers keep this <= 56 so the 64-bit
//        accumulator cannot overflow)
// bits: the bits to add, in the low nbits of the value
// data_buf: output position; at least 8 bytes must be writable here
// bits_in_buffer: number of bits currently pending in bit_buffer
// bit_buffer: 64-bit accumulator holding the pending bits
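//
// Worked example: with bits_in_buffer == 6 and bit_buffer == 0b101010,
// AddBits(5, 0b10110, ...) accumulates bit_buffer == 0b10110'101010
// (11 bits), flushes the low byte 0b10101010 to data_buf, and returns 1;
// the 3 leftover bits (0b101) stay in bit_buffer with bits_in_buffer == 3.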
FJXL_INLINE size_t AddBits(uint32_t nbits, uint64_t bits, uint8_t *data_buf, size_t &bits_in_buffer, uint64_t &bit_buffer)
{
// add bytes
bit_buffer |= bits << bits_in_buffer;
bits_in_buffer += nbits;
StoreLe64(data_buf, bit_buffer);
size_t bytes_in_buffer = bits_in_buffer / 8;
bits_in_buffer -= bytes_in_buffer * 8;
bit_buffer >>= bytes_in_buffer * 8;
return bytes_in_buffer;
}
// A simple bit writer that fills each output byte starting from the
// least-significant bit (little-endian bit order).
struct BitWriter {
size_t bytes_written = 0;
size_t bits_in_buffer = 0;
uint64_t buffer = 0;
std::unique_ptr<uint8_t[], void (*)(void *)> data = {nullptr, free};
/// Allocate enough space to store maximum_bit_size bits
void Allocate(size_t maximum_bit_size)
{
assert(data == nullptr);
// Leave some padding.
data.reset(static_cast<uint8_t *>(malloc(maximum_bit_size / 8 + 64)));
}
/// Add `count` bits to the bit buffer and flush complete bytes to the output
void Write(uint32_t count, uint64_t bits)
{
bytes_written += AddBits(count, bits, data.get() + bytes_written,
bits_in_buffer, buffer);
}
/// Pad the bit buffer with zero bits up to the next byte boundary
void ZeroPadToByte()
{
if (bits_in_buffer != 0) {
Write(8 - bits_in_buffer, 0);
}
}
FJXL_INLINE void WriteMultiple(const uint64_t *nbits, const uint64_t *bits, size_t n)
{
// Necessary because Write() is only guaranteed to work with <=56 bits.
// Trying to SIMD-fy this code results in lower speed (and definitely less
// clarity).
{
for (size_t i = 0; i < n; i++) {
this->buffer |= bits[i] << this->bits_in_buffer;
memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
uint64_t shift = 64 - this->bits_in_buffer;
this->bits_in_buffer += nbits[i];
// This `if` seems to be faster than using ternaries.
if (this->bits_in_buffer >= 64) {
uint64_t next_buffer = bits[i] >> shift;
this->buffer = next_buffer;
this->bits_in_buffer -= 64;
this->bytes_written += 8;
}
}
memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
size_t bytes_in_buffer = this->bits_in_buffer / 8;
this->bits_in_buffer -= bytes_in_buffer * 8;
this->buffer >>= bytes_in_buffer * 8;
this->bytes_written += bytes_in_buffer;
}
}
};
} // namespace
extern "C" {
struct JxlFastLosslessFrameState {
// Width of the image
size_t width;
// Height of the image
size_t height;
// Number of channels present in image
size_t nb_chans;
// depth of image
size_t bitdepth;
// BitWriter used to write the header
BitWriter header;
std::vector<std::array<BitWriter, 4> > group_data;
size_t current_bit_writer = 0;
size_t bit_writer_byte_pos = 0;
size_t bits_in_buffer = 0;
uint64_t bit_buffer = 0;
};
// Return the total output size, in bytes, of the frame written so far
size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState *frame)
{
size_t total_size_groups = 0;
for (size_t i = 0; i < frame->group_data.size(); i++) {
size_t sz = 0;
for (size_t j = 0; j < frame->nb_chans; j++) {
const auto &writer = frame->group_data[i][j];
sz += writer.bytes_written * 8 + writer.bits_in_buffer;
}
sz = (sz + 7) / 8;
total_size_groups += sz;
}
return frame->header.bytes_written + total_size_groups;
}
/// Return the maximum required output size for the frame
size_t JxlFastLosslessMaxRequiredOutput(
const JxlFastLosslessFrameState *frame)
{
return JxlFastLosslessOutputSize(frame) + 32;
}
void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState *frame,
int add_image_header,
int is_last)
{
BitWriter *output = &frame->header;
output->Allocate(1000 + frame->group_data.size() * 32);
std::vector<size_t> group_sizes(frame->group_data.size());
for (size_t i = 0; i < frame->group_data.size(); i++) {
size_t sz = 0;
for (size_t j = 0; j < frame->nb_chans; j++) {
const auto &writer = frame->group_data[i][j];
sz += writer.bytes_written * 8 + writer.bits_in_buffer;
}
sz = (sz + 7) / 8;
group_sizes[i] = sz;
}
bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);
if (add_image_header) {
// Signature
output->Write(16, 0x0AFF);
// Size header, hand-crafted.
// Not small
output->Write(1, 0);
auto wsz = [output](size_t size) {
if (size - 1 < (1 << 9)) {
output->Write(2, 0b00);
output->Write(9, size - 1);
} else if (size - 1 < (1 << 13)) {
output->Write(2, 0b01);
output->Write(13, size - 1);
} else if (size - 1 < (1 << 18)) {
output->Write(2, 0b10);
output->Write(18, size - 1);
} else {
output->Write(2, 0b11);
output->Write(30, size - 1);
}
};
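// The lambda above writes the U32 distribution used for the image
// dimensions: a 2-bit selector followed by size - 1 in 9, 13, 18 or 30
// bits, i.e. 1 + u(9) / 1 + u(13) / 1 + u(18) / 1 + u(30).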
wsz(frame->height);
// No special ratio.
output->Write(3, 0);
wsz(frame->width);
// Hand-crafted ImageMetadata.
output->Write(1, 0); // all_default
output->Write(1, 0); // extra_fields
output->Write(1, 0); // bit_depth.floating_point_sample
if (frame->bitdepth == 8) {
output->Write(2, 0b00); // bit_depth.bits_per_sample = 8
} else if (frame->bitdepth == 10) {
output->Write(2, 0b01); // bit_depth.bits_per_sample = 10
} else if (frame->bitdepth == 12) {
output->Write(2, 0b10); // bit_depth.bits_per_sample = 12
} else {
output->Write(2, 0b11); // 1 + u(6)
output->Write(6, frame->bitdepth - 1);
}
if (frame->bitdepth <= 14) {
output->Write(1, 1); // 16-bit-buffer sufficient
} else {
output->Write(1, 0); // 16-bit-buffer NOT sufficient
}
if (have_alpha) {
output->Write(2, 0b01); // One extra channel
output->Write(1, 1); // ... all_default (i.e. 8-bit alpha)
} else {
output->Write(2, 0b00); // No extra channel
}
output->Write(1, 0); // Not XYB
if (frame->nb_chans > 1) {
output->Write(1, 1); // color_encoding.all_default (sRGB)
} else {
output->Write(1, 0); // color_encoding.all_default false
output->Write(1, 0); // color_encoding.want_icc false
output->Write(2, 1); // grayscale
output->Write(2, 1); // D65
output->Write(1, 0); // no gamma transfer function
output->Write(2, 0b10); // tf: 2 + u(4)
output->Write(4, 11); // tf of sRGB
output->Write(2, 1); // relative rendering intent
}
output->Write(2, 0b00); // No extensions.
output->Write(1, 1); // all_default transform data
// No ICC, no preview. Frame should start at a byte boundary.
output->ZeroPadToByte();
}
// Handcrafted frame header.
output->Write(1, 0); // all_default
output->Write(2, 0b00); // regular frame
output->Write(1, 1); // modular
output->Write(2, 0b00); // default flags
output->Write(1, 0); // not YCbCr
output->Write(2, 0b00); // no upsampling
if (have_alpha) {
output->Write(2, 0b00); // no alpha upsampling
}
output->Write(2, 0b01); // default group size
output->Write(2, 0b00); // exactly one pass
output->Write(1, 0); // no custom size or origin
output->Write(2, 0b00); // kReplace blending mode
if (have_alpha) {
output->Write(2, 0b00); // kReplace blending mode for alpha channel
}
output->Write(1, is_last); // is_last
output->Write(2, 0b00); // a frame has no name
output->Write(1, 0); // loop filter is not all_default
output->Write(1, 0); // no gaborish
output->Write(2, 0); // 0 EPF iters
output->Write(2, 0b00); // No LF extensions
output->Write(2, 0b00); // No FH extensions
output->Write(1, 0); // No TOC permutation
output->ZeroPadToByte(); // TOC is byte-aligned.
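// TOC entries below use the U32 distribution u(10), 1024 + u(14),
// 17408 + u(22), 4211712 + u(30): each offset is the previous offset
// plus the previous field's range (1024 + 2^14 == 17408,
// 17408 + 2^22 == 4211712).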
for (size_t i = 0; i < frame->group_data.size(); i++) {
size_t sz = group_sizes[i];
if (sz < (1 << 10)) {
output->Write(2, 0b00);
output->Write(10, sz);
} else if (sz - 1024 < (1 << 14)) {
output->Write(2, 0b01);
output->Write(14, sz - 1024);
} else if (sz - 17408 < (1 << 22)) {
output->Write(2, 0b10);
output->Write(22, sz - 17408);
} else {
output->Write(2, 0b11);
output->Write(30, sz - 4211712);
}
}
output->ZeroPadToByte(); // Groups are byte-aligned.
}
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState *frame,
unsigned char *output,
size_t output_size)
{
assert(output_size >= 32);
unsigned char *initial_output = output;
size_t (*append_bytes_with_bit_offset)(const uint8_t *, size_t, size_t,
unsigned char *, uint64_t &)
= nullptr;
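// append_bytes_with_bit_offset is a hook for an accelerated shifted
// copy; it stays nullptr in this scalar version, so the byte-by-byte
// AddBits fallback below does the work. Channel streams within a group
// are concatenated at bit granularity, which is why a running bit
// offset (frame->bits_in_buffer) is carried across writers; only group
// boundaries are padded to whole bytes.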
while (true) {
size_t &cur = frame->current_bit_writer;
size_t &bw_pos = frame->bit_writer_byte_pos;
if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
return output - initial_output;
}
if (output_size <= 8) {
return output - initial_output;
}
size_t nbc = frame->nb_chans;
const BitWriter &writer = cur == 0 ? frame->header : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
size_t full_byte_count = std::min(output_size - 8, writer.bytes_written - bw_pos);
if (frame->bits_in_buffer == 0) {
memcpy(output, writer.data.get() + bw_pos, full_byte_count);
} else {
size_t i = 0;
if (append_bytes_with_bit_offset) {
i += append_bytes_with_bit_offset(
writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
output, frame->bit_buffer);
}
for (; i < full_byte_count; i++) {
AddBits(8, writer.data.get()[bw_pos + i], output + i,
frame->bits_in_buffer, frame->bit_buffer);
}
}
output += full_byte_count;
output_size -= full_byte_count;
bw_pos += full_byte_count;
if (bw_pos == writer.bytes_written) {
auto write = [&](size_t num, uint64_t bits) {
size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
frame->bit_buffer);
output += n;
output_size -= n;
};
if (writer.bits_in_buffer) {
write(writer.bits_in_buffer, writer.buffer);
}
bw_pos = 0;
cur++;
if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
write(8 - frame->bits_in_buffer, 0);
}
}
}
}
void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState *frame)
{
delete frame;
}
} // extern "C"
#endif
#ifdef FJXL_SELF_INCLUDE
namespace {
constexpr size_t kNumRawSymbols = 19;
constexpr size_t kNumLZ77 = 33;
constexpr size_t kLZ77CacheSize = 32;
constexpr size_t kLZ77Offset = 224;
constexpr size_t kLZ77MinLength = 7;
void EncodeHybridUintLZ77(uint32_t value, uint32_t *token, uint32_t *nbits, uint32_t *bits)
{
// 4-0-0 hybrid uint config: values below 16 (2^4) are the token
// itself; larger values store floor(log2(value)) in the token and the
// remainder in nbits extra bits.
uint32_t n = ff_log(value);
*token = value < 16 ? value : 16 + n - 4;
*nbits = value < 16 ? 0 : n;
*bits = value < 16 ? 0 : value - (1 << *nbits);
}
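// Worked example: value == 23 gives n == 4 (floor(log2(23))), so
// token == 16 + 4 - 4 == 16, nbits == 4 and bits == 23 - 16 == 7;
// value == 9 is below 16 and becomes token 9 with no extra bits.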
struct PrefixCode {
uint8_t raw_nbits[kNumRawSymbols] = {};
uint8_t raw_bits[kNumRawSymbols] = {};
alignas(64) uint8_t raw_nbits_simd[16] = {};
alignas(64) uint8_t raw_bits_simd[16] = {};
uint8_t lz77_nbits[kNumLZ77] = {};
uint16_t lz77_bits[kNumLZ77] = {};
uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
static constexpr size_t kMaxNumSymbols = kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
static uint16_t BitReverse(size_t nbits, uint16_t bits)
{
constexpr uint16_t kNibbleLookup[16] = {
0b0000,0b1000,0b0100,0b1100,
0b0010,0b1010,0b0110,0b1110,
0b0001,0b1001,0b0101,0b1101,
0b0011,0b1011,0b0111,0b1111,
};
uint16_t rev16 = (kNibbleLookup[(bits >> 0) & 0xF] << 12)
| (kNibbleLookup[(bits >> 4) & 0xF] << 8)
| (kNibbleLookup[(bits >> 8) & 0xF] << 4)
| (kNibbleLookup[(bits >> 12) & 0xF] << 0);
return rev16 >> (16 - nbits);
}
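// Example: BitReverse(5, 0b10110) == 0b01101. The 16-bit reversal of
// 0b0000'0000'0001'0110 is 0b0110'1000'0000'0000, and shifting right by
// 16 - 5 == 11 keeps the reversed code in the low 5 bits. The reversal
// is needed because canonical Huffman codes are defined MSB-first while
// the BitWriter emits bits LSB-first.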
// Create the prefix codes given the code lengths.
// Supports the code lengths being split into two halves.
static void ComputeCanonicalCode(const uint8_t *first_chunk_nbits,
uint8_t *first_chunk_bits,
size_t first_chunk_size,
const uint8_t *second_chunk_nbits,
uint16_t *second_chunk_bits,
size_t second_chunk_size)
{
constexpr size_t kMaxCodeLength = 15;
uint8_t code_length_counts[kMaxCodeLength + 1] = {};
for (size_t i = 0; i < first_chunk_size; i++) {
code_length_counts[first_chunk_nbits[i]]++;
assert(first_chunk_nbits[i] <= kMaxCodeLength);
assert(first_chunk_nbits[i] <= 8);
assert(first_chunk_nbits[i] > 0);
}
for (size_t i = 0; i < second_chunk_size; i++) {
code_length_counts[second_chunk_nbits[i]]++;
assert(second_chunk_nbits[i] <= kMaxCodeLength);
}
uint16_t next_code[kMaxCodeLength + 1] = {};
uint16_t code = 0;
for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
code = (code + code_length_counts[i - 1]) << 1;
next_code[i] = code;
}
for (size_t i = 0; i < first_chunk_size; i++) {
first_chunk_bits[i] = BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
}
for (size_t i = 0; i < second_chunk_size; i++) {
second_chunk_bits[i] = BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
}
}
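// Example: code lengths {1, 2, 3, 3} produce the canonical (MSB-first)
// codes 0, 10, 110, 111; after BitReverse they are stored as 0, 01,
// 011, 111 so that writing them LSB-first yields the bitstream a
// canonical-code decoder expects.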
static void ComputeCodeLengthsNonZeroImpl(const uint64_t *freqs, size_t n, size_t precision, uint64_t infty, uint8_t *min_limit, uint8_t *max_limit, uint8_t *nbits)
{
std::vector<uint64_t> dynp(((1U << precision) + 1) * (n + 1), infty);
auto d = [&](size_t sym, size_t off) -> uint64_t & {
return dynp[sym * ((1 << precision) + 1) + off];
};
d(0, 0) = 0;
for (size_t sym = 0; sym < n; sym++) {
for (uint64_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
size_t off_delta = 1U << (precision - bits);
for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
d(sym + 1, off + off_delta) = std::min(d(sym, off) + static_cast<uint64_t>(freqs[sym]) * bits,
d(sym + 1, off + off_delta));
}
}
}
size_t sym = n;
size_t off = 1U << precision;
assert(d(sym, off) != infty);
while (sym-- > 0) {
assert(off > 0);
for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
size_t off_delta = 1U << (precision - bits);
if (off_delta <= off && d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
off -= off_delta;
nbits[sym] = bits;
break;
}
}
}
}
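// The dynamic program above tracks the Kraft budget in units of
// 2^-precision: assigning a length of `bits` to a symbol consumes
// 2^(precision - bits) units, d(sym, off) is the minimal total cost
// sum(freqs[i] * nbits[i]) over the first `sym` symbols using exactly
// `off` units, and a complete code must consume the full budget
// (off == 2^precision, i.e. sum of 2^-nbits[i] == 1). The second loop
// walks the table backwards to recover the chosen lengths.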
// Computes nbits[i] for i < n, subject to min_limit[i] <= nbits[i] <=
// max_limit[i] and sum 2**-nbits[i] == 1, so as to minimize
// sum(nbits[i] * freqs[i]).
static void ComputeCodeLengthsNonZero(const uint64_t *freqs, size_t n, uint8_t *min_limit, uint8_t *max_limit, uint8_t *nbits)
{
size_t precision = 0;
size_t shortest_length = 255;
uint64_t freqsum = 0;
for (size_t i = 0; i < n; i++) {
assert(freqs[i] != 0);
freqsum += freqs[i];
if (min_limit[i] < 1)
min_limit[i] = 1;
assert(min_limit[i] <= max_limit[i]);
precision = std::max<size_t>(max_limit[i], precision);
shortest_length = std::min<size_t>(min_limit[i], shortest_length);
}
// If all the minimum limits are greater than 1, shift precision so that we
// behave as if the shortest was 1.
precision -= shortest_length - 1;
uint64_t infty = freqsum * precision;
ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
max_limit, nbits);
}
static void ComputeCodeLengths(const uint64_t *freqs, size_t n, const uint8_t *min_limit_in, const uint8_t *max_limit_in, uint8_t *nbits)
{
assert(n <= kMaxNumSymbols);
uint64_t compact_freqs[kMaxNumSymbols];
uint8_t min_limit[kMaxNumSymbols];
uint8_t max_limit[kMaxNumSymbols];
size_t ni = 0;
for (size_t i = 0; i < n; i++) {
if (freqs[i]) {
compact_freqs[ni] = freqs[i];
min_limit[ni] = min_limit_in[i];
max_limit[ni] = max_limit_in[i];
ni++;
}
}
uint8_t num_bits[kMaxNumSymbols] = {};
ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
num_bits);
ni = 0;
for (size_t i = 0; i < n; i++) {
nbits[i] = 0;
if (freqs[i]) {
nbits[i] = num_bits[ni++];
}
}
}
// Invalid code, used to construct arrays.
PrefixCode() {}
template <typename BitDepth>
PrefixCode(BitDepth, uint64_t *raw_counts, uint64_t *lz77_counts)
{
// "merge" together all the lz77 counts in a single symbol for the level 1
// table (containing just the raw symbols, up to length 7).
uint64_t level1_counts[kNumRawSymbols + 1];
memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
size_t numraw = kNumRawSymbols;
while (numraw > 0 && level1_counts[numraw - 1] == 0)
numraw--;
level1_counts[numraw] = 0;
for (size_t i = 0; i < kNumLZ77; i++) {
level1_counts[numraw] += lz77_counts[i];
}
uint8_t level1_nbits[kNumRawSymbols + 1] = {};
ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
BitDepth::kMaxRawLength, level1_nbits);
uint8_t level2_nbits[kNumLZ77] = {};
uint8_t min_lengths[kNumLZ77] = {};
uint8_t l = 15 - level1_nbits[numraw];
uint8_t max_lengths[kNumLZ77];
for (size_t i = 0; i < kNumLZ77; i++) {
max_lengths[i] = l;
}
size_t num_lz77 = kNumLZ77;
while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0)
num_lz77--;
ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
level2_nbits);
for (size_t i = 0; i < numraw; i++) {
raw_nbits[i] = level1_nbits[i];
}
for (size_t i = 0; i < num_lz77; i++) {
lz77_nbits[i] = level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
}
ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
kNumLZ77);
// Prepare lz77 cache
for (size_t count = 0; count < kLZ77CacheSize; count++) {
unsigned token, nbits, bits;
EncodeHybridUintLZ77(count, &token, &nbits, &bits);
lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
lz77_cache_bits[count] = (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) | raw_bits[0];
}
}
void WriteTo(BitWriter *writer) const
{
uint64_t code_length_counts[18] = {};
code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
for (size_t i = 0; i < kNumRawSymbols; i++) {
code_length_counts[raw_nbits[i]]++;
}
for (size_t i = 0; i < kNumLZ77; i++) {
code_length_counts[lz77_nbits[i]]++;
}
uint8_t code_length_nbits[18] = {};
uint8_t code_length_nbits_min[18] = {};
uint8_t code_length_nbits_max[18] = {
5,5,5,5,5,5,
5,5,5,5,5,5,
5,5,5,5,5,5,
};
ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
code_length_nbits_max, code_length_nbits);
writer->Write(2, 0b00); // HSKIP = 0, i.e. don't skip code lengths.
// As per Brotli RFC.
uint8_t code_length_order[18] = {1, 2, 3, 4, 0, 5, 17, 6, 16,
7, 8, 9, 10, 11, 12, 13, 14, 15};
uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
// Encode lengths of code lengths.
size_t num_code_lengths = 18;
while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
num_code_lengths--;
}
for (size_t i = 0; i < num_code_lengths; i++) {
int symbol = code_length_nbits[code_length_order[i]];
writer->Write(code_length_length_nbits[symbol],
code_length_length_bits[symbol]);
}
// Compute the canonical codes for the codes that represent the lengths of
// the actual codes for data.
uint16_t code_length_bits[18] = {};
ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
code_length_bits, 18);
// Encode raw bit code lengths.
for (size_t i = 0; i < kNumRawSymbols; i++) {
writer->Write(code_length_nbits[raw_nbits[i]],
code_length_bits[raw_nbits[i]]);
}
size_t num_lz77 = kNumLZ77;
while (lz77_nbits[num_lz77 - 1] == 0) {
num_lz77--;
}
// Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
// 205.
static_assert(kLZ77Offset == 224);
static_assert(kNumRawSymbols == 19);
writer->Write(code_length_nbits[17], code_length_bits[17]);
writer->Write(3, 0b010); // 5
writer->Write(code_length_nbits[17], code_length_bits[17]);
writer->Write(3, 0b000); // (5-2)*8 + 3 = 27
writer->Write(code_length_nbits[17], code_length_bits[17]);
writer->Write(3, 0b010); // (27-2)*8 + 5 = 205
// Encode LZ77 symbols, with values 224+i.
for (size_t i = 0; i < num_lz77; i++) {
writer->Write(code_length_nbits[lz77_nbits[i]],
code_length_bits[lz77_nbits[i]]);
}
}
};
void EncodeHybridUint000(uint32_t value, uint32_t *token, uint32_t *nbits, uint32_t *bits)
{
uint32_t n = ff_log(value);
*token = value ? n + 1 : 0;
*nbits = value ? n : 0;
*bits = value ? value - (1 << n) : 0;
}
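// Worked example for the 0-0-0 config: value == 0 maps to token 0 with
// no extra bits; value == 13 gives n == 3, token == 4, nbits == 3 and
// bits == 13 - 8 == 5.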
constexpr static size_t kLogChunkSize = 3;
constexpr static size_t kChunkSize = 1 << kLogChunkSize;
template <typename Residual>
void GenericEncodeChunk(const Residual *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output)
{
for (size_t ix = skip; ix < n; ix++) {
unsigned token, nbits, bits;
EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
output.Write(code.raw_nbits[token] + nbits,
code.raw_bits[token] | bits << code.raw_nbits[token]);
}
}
struct UpTo8Bits {
size_t bitdepth;
explicit UpTo8Bits(size_t bitdepth)
: bitdepth(bitdepth)
{
assert(bitdepth <= 8);
}
// Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
// symbols, we could actually go up to 8 Huffman bits as we have at most 8
// extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
// Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for
// LZ77 lengths and has no limitations except allowing to represent 32 symbols
// in total.
static constexpr uint8_t kMinRawLength[12] = {};
static constexpr uint8_t kMaxRawLength[12] = {
7,7,7,7,7,7,
7,7,7,7,7,10
};
static size_t MaxEncodedBitsPerSample() { return 16; }
static constexpr size_t kInputBytes = 1;
using pixel_t = int16_t;
using upixel_t = uint16_t;
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output)
{
GenericEncodeChunk(residuals, n, skip, code, output);
}
size_t NumSymbols(bool doing_ycocg) const
{
// values gain 1 bit for YCoCg, 1 bit for prediction.
// Maximum symbol is 1 + effective bit depth of residuals.
if (doing_ycocg) {
return bitdepth + 3;
} else {
return bitdepth + 2;
}
}
};
constexpr uint8_t UpTo8Bits::kMinRawLength[];
constexpr uint8_t UpTo8Bits::kMaxRawLength[];
struct From9To13Bits {
size_t bitdepth;
explicit From9To13Bits(size_t bitdepth)
: bitdepth(bitdepth)
{
assert(bitdepth <= 13 && bitdepth >= 9);
}
// Last symbol is used for LZ77 lengths and has no limitations except allowing
// to represent 32 symbols in total.
// We cannot fit all the bits in a u16, so do not even try and use up to 8
// bits per raw symbol.
// There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
// any special tricks.
static constexpr uint8_t kMinRawLength[17] = {};
static constexpr uint8_t kMaxRawLength[17] = {
8,8,8,8,8,8,8,8,
8,8,8,8,8,8,8,8,
10
};
static size_t MaxEncodedBitsPerSample() { return 21; }
static constexpr size_t kInputBytes = 2;
using pixel_t = int16_t;
using upixel_t = uint16_t;
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output)
{
GenericEncodeChunk(residuals, n, skip, code, output);
}
size_t NumSymbols(bool doing_ycocg) const
{
// values gain 1 bit for YCoCg, 1 bit for prediction.
// Maximum symbol is 1 + effective bit depth of residuals.
if (doing_ycocg) {
return bitdepth + 3;
} else {
return bitdepth + 2;
}
}
};
constexpr uint8_t From9To13Bits::kMinRawLength[];
constexpr uint8_t From9To13Bits::kMaxRawLength[];
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2)
{
assert(nbits1 == 8);
assert(nbits2 == 8);
assert(bits2 == (bits1 | 128));
}
struct Exactly14Bits {
explicit Exactly14Bits(size_t bitdepth) { assert(bitdepth == 14); }
// Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
// have exactly 8, and no other symbol to have 8 or more. This ensures that
// the representation for 15 and 16 is identical up to one bit.
static constexpr uint8_t kMinRawLength[18] = {
0,0,0,0,0,0,
0,0,0,0,0,0,
0,0,0,8,8,7,
};
static constexpr uint8_t kMaxRawLength[18] = {
7,7,7,7,7,
7,7,7,7,7,
7,7,7,7,7,
8,8,10,
};
static constexpr size_t bitdepth = 14;
static size_t MaxEncodedBitsPerSample() { return 22; }
static constexpr size_t kInputBytes = 2;
using pixel_t = int16_t;
using upixel_t = uint16_t;
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output)
{
GenericEncodeChunk(residuals, n, skip, code, output);
}
size_t NumSymbols(bool) const { return 17; }
};
constexpr uint8_t Exactly14Bits::kMinRawLength[];
constexpr uint8_t Exactly14Bits::kMaxRawLength[];
struct MoreThan14Bits {
size_t bitdepth;
explicit MoreThan14Bits(size_t bitdepth)
: bitdepth(bitdepth)
{
assert(bitdepth > 14);
assert(bitdepth <= 16);
}
// Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
// have exactly 8, and no other symbol to have 8 or more. This ensures that
// the representation for (13, 14), (15, 16), (17, 18) is identical up to one
// bit.
static constexpr uint8_t kMinRawLength[20] = {
0,0,0,0,0,
0,0,0,0,0,
0,0,0,8,8,
8,8,8,8,7,
};
static constexpr uint8_t kMaxRawLength[20] = {
7,7,7,7,7,
7,7,7,7,7,
7,7,7,8,8,
8,8,8,8,10
};
static size_t MaxEncodedBitsPerSample() { return 24; }
static constexpr size_t kInputBytes = 2;
using pixel_t = int32_t;
using upixel_t = uint32_t;
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output)
{
GenericEncodeChunk(residuals, n, skip, code, output);
}
size_t NumSymbols(bool) const { return 19; }
};
constexpr uint8_t MoreThan14Bits::kMinRawLength[];
constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height, const PrefixCode code[4], BitWriter *output)
{
output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
// No patches, spline or noise.
output->Write(1, 1); // default DC dequantization factors (?)
output->Write(1, 1); // use global tree / histograms
output->Write(1, 0); // no lz77 for the tree
output->Write(1, 1); // simple code for the tree's context map
output->Write(2, 0); // all contexts clustered together
output->Write(1, 1); // use prefix code for tree
output->Write(4, 0); // 000 hybrid uint
output->Write(6, 0b100011); // Alphabet size is 4 (var16)
output->Write(2, 1); // simple prefix code
output->Write(2, 3); // with 4 symbols
output->Write(2, 0);
output->Write(2, 1);
output->Write(2, 2);
output->Write(2, 3);
output->Write(1, 0); // First tree encoding option
// Huffman table + extra bits for the tree.
uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
// Write a tree with a leaf per channel, and gradient predictor for every
// leaf.
for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
output->Write(symbol_nbits[v], symbol_bits[v]);
}
output->Write(1, 1); // Enable lz77 for the main bitstream
output->Write(2, 0b00); // lz77 offset 224
static_assert(kLZ77Offset == 224, "");
output->Write(4, 0b1010); // lz77 min length 7
// 400 hybrid uint config for lz77
output->Write(4, 4);
output->Write(3, 0);
output->Write(3, 0);
output->Write(1, 1); // simple code for the context map
output->Write(2, 3); // 3 bits per entry
output->Write(3, 4); // channel 3
output->Write(3, 3); // channel 2
output->Write(3, 2); // channel 1
output->Write(3, 1); // channel 0
output->Write(3, 0); // distance histogram first
output->Write(1, 1); // use prefix codes
output->Write(4, 0); // 000 hybrid uint config for distances (only need 0)
for (size_t i = 0; i < 4; i++) {
output->Write(4, 0); // 000 hybrid uint config for symbols (only <= 10)
}
// Distance alphabet size:
output->Write(5, 0b00001); // 2: just need 1 for RLE (i.e. distance 1)
// Symbol + LZ77 alphabet size:
for (size_t i = 0; i < 4; i++) {
output->Write(1, 1); // > 1
output->Write(4, 8); // <= 512
output->Write(8, 256); // == 512
}
// Distance histogram:
output->Write(2, 1); // simple prefix code
output->Write(2, 0); // with one symbol
output->Write(1, 1); // 1
// Symbol + lz77 histogram:
for (size_t i = 0; i < 4; i++) {
code[i].WriteTo(output);
}
// Group header for global modular image.
output->Write(1, 1); // Global tree
output->Write(1, 1); // All default wp
}
void PrepareDCGlobal(bool is_single_group, size_t width, size_t height, size_t nb_chans, const PrefixCode code[4], BitWriter *output)
{
PrepareDCGlobalCommon(is_single_group, width, height, code, output);
if (nb_chans > 2) {
output->Write(2, 0b01); // 1 transform
output->Write(2, 0b00); // RCT
output->Write(5, 0b00000); // Starting from ch 0
output->Write(2, 0b00); // YCoCg
} else {
output->Write(2, 0b00); // no transforms
}
if (!is_single_group) {
output->ZeroPadToByte();
}
}
template <typename BitDepth>
struct ChunkEncoder {
FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode &code, BitWriter &output)
{
if (count == 0)
return;
count -= kLZ77MinLength + 1;
if (count < kLZ77CacheSize) {
output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
} else {
unsigned token, nbits, bits;
EncodeHybridUintLZ77(count, &token, &nbits, &bits);
uint64_t wbits = bits;
wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
}
}
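// A run of `count` zero residuals is coded as one literal zero (the raw
// symbol 0, written first) followed by an LZ77 match of length
// count - 1 at distance 1, which keeps copying that zero; the distance
// histogram holds a single symbol, so the distance costs no bits.
// Subtracting kLZ77MinLength + 1 maps the shortest encodable run,
// kLZ77MinLength + 1, to the length value 0.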
FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t *residuals, size_t skip, size_t n)
{
EncodeRle(run, *code, *output);
BitDepth::EncodeChunk(residuals, n, skip, *code, *output);
}
inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
const PrefixCode *code;
BitWriter *output;
};
template <typename BitDepth>
struct ChunkSampleCollector {
uint64_t *raw_counts;
uint64_t *lz77_counts;
FJXL_INLINE void Rle(size_t count)
{
if (count == 0)
return;
raw_counts[0] += 1;
count -= kLZ77MinLength + 1;
unsigned token, nbits, bits;
EncodeHybridUintLZ77(count, &token, &nbits, &bits);
lz77_counts[token]++;
}
FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t *residuals, size_t skip, size_t n)
{
// Run is broken. Collect statistics for the run and for the individual
// residuals.
Rle(run);
for (size_t ix = skip; ix < n; ix++) {
unsigned token, nbits, bits;
EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
raw_counts[token]++;
}
}
// don't count final run since we don't know how long it really is
void Finalize(size_t run) {}
};
constexpr uint32_t PackSigned(int32_t value)
{
return (static_cast<uint32_t>(value) << 1) ^ ((static_cast<uint32_t>(~value) >> 31) - 1);
}
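// The zigzag mapping interleaves signs so small-magnitude residuals get
// small tokens; a few compile-time checks make the mapping concrete:
static_assert(PackSigned(0) == 0, "zigzag: 0 -> 0");
static_assert(PackSigned(-1) == 1, "zigzag: -1 -> 1");
static_assert(PackSigned(1) == 2, "zigzag: 1 -> 2");
static_assert(PackSigned(-2) == 3, "zigzag: -2 -> 3");
static_assert(PackSigned(2) == 4, "zigzag: 2 -> 4");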
template <typename T, typename BitDepth>
struct ChannelRowProcessor {
// Invariant: run == 0 or run > kLZ77MinLength.
size_t run = 0;
using upixel_t = typename BitDepth::upixel_t;
using pixel_t = typename BitDepth::pixel_t;
T t;
void ProcessChunk(const pixel_t *row, const pixel_t *row_left, const pixel_t *row_top, const pixel_t *row_topleft, size_t n)
{
alignas(64) upixel_t residuals[kChunkSize] = {};
size_t prefix_size = 0;
size_t required_prefix_size = 0;
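// The branchless predictor below is the clamped-gradient predictor:
// grad = left + top - topleft, clamped to [min(left, top),
// max(left, top)]. If (left - topleft) and (top - topleft) have
// opposite signs (s < 0), topleft lies between left and top and grad is
// already in range; otherwise topleft is an extreme value and the sign
// of d = (left - top) ^ (top - topleft) selects whichever of left/top
// is the correct clamp. The unsigned cast when computing grad avoids
// signed overflow, which would be undefined behavior.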
for (size_t ix = 0; ix < kChunkSize; ix++) {
pixel_t px = row[ix];
pixel_t left = row_left[ix];
pixel_t top = row_top[ix];
pixel_t topleft = row_topleft[ix];
pixel_t ac = left - topleft;
pixel_t ab = left - top;
pixel_t bc = top - topleft;
pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) + static_cast<upixel_t>(top));
pixel_t d = ab ^ bc;
pixel_t clamp = d < 0 ? top : left;
pixel_t s = ac ^ bc;
pixel_t pred = s < 0 ? grad : clamp;
residuals[ix] = PackSigned(px - pred);
prefix_size = prefix_size == required_prefix_size ? prefix_size + (residuals[ix] == 0) : prefix_size;
required_prefix_size += 1;
}
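// After the loop, prefix_size holds the number of leading zero
// residuals in this chunk, computed branchlessly: it only grows while
// it still equals the loop counter (required_prefix_size), i.e. while
// every residual seen so far was zero. E.g. residuals {0, 0, 0, 5, ...}
// give prefix_size == 3.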
prefix_size = std::min(n, prefix_size);
if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
// Run continues, nothing to do.
run += prefix_size;
} else if (prefix_size + run > kLZ77MinLength) {
// Run is broken. Encode the run and encode the individual vector.
t.Chunk(run + prefix_size, residuals, prefix_size, n);
run = 0;
} else {
// There was no run to begin with.
t.Chunk(0, residuals, 0, n);
}
}
void ProcessRow(const pixel_t *row, const pixel_t *row_left, const pixel_t *row_top, const pixel_t *row_topleft, size_t xs)
{
for (size_t x = 0; x < xs; x += kChunkSize) {
ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
std::min(kChunkSize, xs - x));
}
}
void Finalize() {
t.Finalize(this->run);
}
};
uint16_t LoadLE16(const unsigned char *ptr)
{
return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8);
}
uint16_t SwapEndian(uint16_t in)
{
return (in >> 8) | (in << 8);
}
template <typename pixel_t>
void FillRowG8(const unsigned char *rgba, size_t oxs, pixel_t *luma)
{
size_t x = 0;
for (; x < oxs; x++) {
luma[x] = rgba[x];
}
}
template <bool big_endian, typename pixel_t>
void FillRowG16(const unsigned char *rgba, size_t oxs, pixel_t *luma)
{
size_t x = 0;
for (; x < oxs; x++) {
uint16_t val = LoadLE16(rgba + 2 * x);
if (big_endian) {
val = SwapEndian(val);
}
luma[x] = val;
}
}
template <typename pixel_t>
void FillRowGA8(const unsigned char *rgba, size_t oxs, pixel_t *luma, pixel_t *alpha)
{
size_t x = 0;
for (; x < oxs; x++) {
luma[x] = rgba[2 * x];
alpha[x] = rgba[2 * x + 1];
}
}
template <bool big_endian, typename pixel_t>
void FillRowGA16(const unsigned char *rgba, size_t oxs, pixel_t *luma, pixel_t *alpha)
{
size_t x = 0;
for (; x < oxs; x++) {
uint16_t l = LoadLE16(rgba + 4 * x);
uint16_t a = LoadLE16(rgba + 4 * x + 2);
if (big_endian) {
l = SwapEndian(l);
a = SwapEndian(a);
}
luma[x] = l;
alpha[x] = a;
}
}
template <typename pixel_t>
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t *y, pixel_t *co, pixel_t *cg)
{
*co = r - b;
pixel_t tmp = b + (*co >> 1);
*cg = g - tmp;
*y = tmp + (*cg >> 1);
}
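// This is the lifting-based, exactly reversible YCoCg-R transform
// (>> on the signed pixel_t is an arithmetic shift on the targeted
// compilers). Worked example: (r, g, b) == (200, 100, 50) gives
// co == 150, tmp == 50 + 75 == 125, cg == -25, y == 125 + (-13) == 112.
// A decoder inverts it with tmp = y - (cg >> 1); g = cg + tmp;
// b = tmp - (co >> 1); r = co + b.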
template <typename pixel_t>
void FillRowRGB8(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg)
{
size_t x = 0;
for (; x < oxs; x++) {
uint16_t r = rgba[3 * x];
uint16_t g = rgba[3 * x + 1];
uint16_t b = rgba[3 * x + 2];
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
}
}
template <bool big_endian, typename pixel_t>
void FillRowRGB16(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg)
{
size_t x = 0;
for (; x < oxs; x++) {
uint16_t r = LoadLE16(rgba + 6 * x);
uint16_t g = LoadLE16(rgba + 6 * x + 2);
uint16_t b = LoadLE16(rgba + 6 * x + 4);
if (big_endian) {
r = SwapEndian(r);
g = SwapEndian(g);
b = SwapEndian(b);
}
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
}
}
template <typename pixel_t>
void FillRowRGBA8(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg, pixel_t *alpha)
{
size_t x = 0;
for (; x < oxs; x++) {
uint16_t r = rgba[4 * x];
uint16_t g = rgba[4 * x + 1];
uint16_t b = rgba[4 * x + 2];
uint16_t a = rgba[4 * x + 3];
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
alpha[x] = a;
}
}
template <bool big_endian, typename pixel_t>
void FillRowRGBA16(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg, pixel_t *alpha)
{
size_t x = 0;
for (; x < oxs; x++) {
uint16_t r = LoadLE16(rgba + 8 * x);
uint16_t g = LoadLE16(rgba + 8 * x + 2);
uint16_t b = LoadLE16(rgba + 8 * x + 4);
uint16_t a = LoadLE16(rgba + 8 * x + 6);
if (big_endian) {
r = SwapEndian(r);
g = SwapEndian(g);
b = SwapEndian(b);
a = SwapEndian(a);
}
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
alpha[x] = a;
}
}
template <typename Processor, typename BitDepth>
void ProcessImageArea(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t yskip, size_t ys, size_t row_stride, BitDepth bitdepth, size_t nb_chans, bool big_endian, Processor *processors)
{
using pixel_t = typename BitDepth::pixel_t;
constexpr size_t kPadding = 32;
constexpr size_t kAlign = 64;
constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
constexpr size_t kNumPx = (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels * kAlignPixels;
auto align = [=](pixel_t *ptr) {
size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign;
if (offset) {
// Advance to the next kAlign-byte boundary (round up).
ptr += (kAlign - offset) / sizeof(pixel_t);
}
return ptr;
};
std::vector<std::array<std::array<pixel_t, kNumPx>, 2> > group_data(nb_chans);
for (size_t y = 0; y < ys; y++) {
const auto rgba_row = rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
pixel_t *crow[4] = {};
pixel_t *prow[4] = {};
for (size_t i = 0; i < nb_chans; i++) {
crow[i] = align(&group_data[i][y & 1][kPadding]);
prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
}
// Pre-fill rows with YCoCg converted pixels.
if (nb_chans == 1) {
if (BitDepth::kInputBytes == 1) {
FillRowG8(rgba_row, xs, crow[0]);
} else if (big_endian) {
FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
} else {
FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
}
} else if (nb_chans == 2) {
if (BitDepth::kInputBytes == 1) {
FillRowGA8(rgba_row, xs, crow[0], crow[1]);
} else if (big_endian) {
FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
} else {
FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
}
} else if (nb_chans == 3) {
if (BitDepth::kInputBytes == 1) {
FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
} else if (big_endian) {
FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
crow[2]);
} else {
FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
crow[2]);
}
} else {
if (BitDepth::kInputBytes == 1) {
FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
} else if (big_endian) {
FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
crow[2], crow[3]);
} else {
FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
crow[2], crow[3]);
}
}
// Deal with x == 0.
for (size_t c = 0; c < nb_chans; c++) {
*(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
// Fix topleft.
*(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
}
if (y < yskip)
continue;
for (size_t c = 0; c < nb_chans; c++) {
// Get pointers to px/left/top/topleft data to speed up the loop.
const pixel_t *row = crow[c];
const pixel_t *row_left = crow[c] - 1;
const pixel_t *row_top = y == 0 ? row_left : prow[c];
const pixel_t *row_topleft = y == 0 ? row_left : prow[c] - 1;
processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
}
}
for (size_t c = 0; c < nb_chans; c++) {
processors[c].Finalize();
}
}
template <typename BitDepth>
void WriteACSection(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t ys, size_t row_stride, bool is_single_group, BitDepth bitdepth, size_t nb_chans, bool big_endian, const PrefixCode code[4], std::array<BitWriter, 4> &output)
{
for (size_t i = 0; i < nb_chans; i++) {
if (is_single_group && i == 0)
continue;
output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4);
}
if (!is_single_group) {
// Group header for modular image.
// When the image is single-group, the global modular image is the one
// that contains the pixel data, and there is no group header.
output[0].Write(1, 1); // Global tree
output[0].Write(1, 1); // All default wp
output[0].Write(2, 0b00); // 0 transforms
}
ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
for (size_t c = 0; c < nb_chans; c++) {
row_encoders[c].t = ChunkEncoder<BitDepth>();
row_encoders[c].t.output = &output[c];
row_encoders[c].t.code = &code[c];
}
ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> >(
rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
row_encoders);
}
constexpr int kHashExp = 16;
constexpr uint32_t kHashSize = 1 << kHashExp;
constexpr uint32_t kHashMultiplier = 2654435761;
constexpr int kMaxColors = 512;
// can be any function that returns a value in 0 .. kHashSize-1
// has to map 0 to 0
inline uint32_t pixel_hash(uint32_t p)
{
return (p * kHashMultiplier) >> (32 - kHashExp);
}
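// kHashMultiplier is the classic Fibonacci/multiplicative-hashing
// constant near 2^32 / phi: multiplying by a large odd constant and
// keeping the top kHashExp bits spreads nearby pixel values across the
// table, and p == 0 maps to 0 as required above.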
template <size_t nb_chans>
void FillRowPalette(const unsigned char *inrow, size_t xs, const int16_t *lookup, int16_t *out)
{
for (size_t x = 0; x < xs; x++) {
uint32_t p = 0;
memcpy(&p, inrow + x * nb_chans, nb_chans);
out[x] = lookup[pixel_hash(p)];
}
}
template <typename Processor>
void ProcessImageAreaPalette(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t yskip,
size_t ys, size_t row_stride, const int16_t *lookup,
size_t nb_chans, Processor *processors)
{
constexpr size_t kPadding = 32;
std::vector<std::array<int16_t, 256 + kPadding * 2> > group_data(2);
Processor &row_encoder = processors[0];
for (size_t y = 0; y < ys; y++) {
// Pre-fill rows with palette converted pixels.
const unsigned char *inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
int16_t *outrow = &group_data[y & 1][kPadding];
if (nb_chans == 1) {
FillRowPalette<1>(inrow, xs, lookup, outrow);
} else if (nb_chans == 2) {
FillRowPalette<2>(inrow, xs, lookup, outrow);
} else if (nb_chans == 3) {
FillRowPalette<3>(inrow, xs, lookup, outrow);
} else if (nb_chans == 4) {
FillRowPalette<4>(inrow, xs, lookup, outrow);
}
// Deal with x == 0.
group_data[y & 1][kPadding - 1] = y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
// Fix topleft.
group_data[(y - 1) & 1][kPadding - 1] = y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
// Get pointers to px/left/top/topleft data to speed up the loop.
const int16_t *row = &group_data[y & 1][kPadding];
const int16_t *row_left = &group_data[y & 1][kPadding - 1];
const int16_t *row_top = y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
const int16_t *row_topleft = y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
}
row_encoder.Finalize();
}
void WriteACSectionPalette(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t ys, size_t row_stride, bool is_single_group, const PrefixCode code[4], const int16_t *lookup, size_t nb_chans, BitWriter &output)
{
if (!is_single_group) {
output.Allocate(16 * xs * ys + 4);
// Group header for modular image.
// When the image is single-group, the global modular image is the one
// that contains the pixel data, and there is no group header.
output.Write(1, 1); // Global tree
output.Write(1, 1); // All default wp
output.Write(2, 0b00); // 0 transforms
}
ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
row_encoder.t = ChunkEncoder<UpTo8Bits>();
row_encoder.t.output = &output;
row_encoder.t.code = &code[is_single_group?1:0];
ProcessImageAreaPalette<
ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
}
template <typename BitDepth>
void CollectSamples(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t row_stride,
size_t row_count, uint64_t raw_counts[4][kNumRawSymbols],
uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
bool palette, BitDepth bitdepth, size_t nb_chans,
bool big_endian, const int16_t *lookup)
{
if (palette) {
ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
row_sample_collectors[4];
for (size_t c = 0; c < nb_chans; c++) {
row_sample_collectors[c].t = ChunkSampleCollector<UpTo8Bits>();
row_sample_collectors[c].t.raw_counts = raw_counts[is_single_group ? 1 : 0];
row_sample_collectors[c].t.lz77_counts = lz77_counts[is_single_group ? 1 : 0];
}
ProcessImageAreaPalette<
ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits> >(
rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
row_sample_collectors);
} else {
ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
row_sample_collectors[4];
for (size_t c = 0; c < nb_chans; c++) {
row_sample_collectors[c].t = ChunkSampleCollector<BitDepth>();
row_sample_collectors[c].t.raw_counts = raw_counts[c];
row_sample_collectors[c].t.lz77_counts = lz77_counts[c];
}
ProcessImageArea<
ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth> >(
rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
big_endian, row_sample_collectors);
}
}
void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
const PrefixCode code[4], const std::vector<uint32_t> &palette,
size_t pcolors, BitWriter *output)
{
PrepareDCGlobalCommon(is_single_group, width, height, code, output);
output->Write(2, 0b01); // 1 transform
output->Write(2, 0b01); // Palette
output->Write(5, 0b00000); // Starting from ch 0
output->Write(2, 0b10); // 4-channel palette (RGBA)
// pcolors <= kMaxColors + kChunkSize - 1
static_assert(kMaxColors + kChunkSize < 1281,
"add code to signal larger palette sizes");
if (pcolors < 256) {
output->Write(2, 0b00);
output->Write(8, pcolors);
} else {
output->Write(2, 0b01);
output->Write(10, pcolors - 256);
}
output->Write(2, 0b00); // nb_deltas == 0
output->Write(4, 0); // Zero predictor for delta palette
// Encode palette
ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
row_encoder.t = ChunkEncoder<UpTo8Bits>();
row_encoder.t.output = output;
row_encoder.t.code = &code[0];
int16_t p[4][32 + 1024] = {};
uint8_t prgba[4];
size_t i = 0;
size_t have_zero = 0;
if (palette[pcolors - 1] == 0)
have_zero = 1;
for (; i < pcolors; i++) {
memcpy(prgba, &palette[i], 4);
p[0][16 + i + have_zero] = prgba[0];
p[1][16 + i + have_zero] = prgba[1];
p[2][16 + i + have_zero] = prgba[2];
p[3][16 + i + have_zero] = prgba[3];
}
p[0][15] = 0;
row_encoder.ProcessRow(p[0] + 16, p[0] + 15, p[0] + 15, p[0] + 15, pcolors);
p[1][15] = p[0][16];
p[0][15] = p[0][16];
row_encoder.ProcessRow(p[1] + 16, p[1] + 15, p[0] + 16, p[0] + 15, pcolors);
p[2][15] = p[1][16];
p[1][15] = p[1][16];
row_encoder.ProcessRow(p[2] + 16, p[2] + 15, p[1] + 16, p[1] + 15, pcolors);
p[3][15] = p[2][16];
p[2][15] = p[2][16];
row_encoder.ProcessRow(p[3] + 16, p[3] + 15, p[2] + 16, p[2] + 15, pcolors);
row_encoder.Finalize();
if (!is_single_group) {
output->ZeroPadToByte();
}
}
template <typename BitDepth>
JxlFastLosslessFrameState *LLEnc(const unsigned char *rgba, size_t width, size_t stride, size_t height,
BitDepth bitdepth, size_t nb_chans, bool big_endian,
int effort, void *runner_opaque, FJxlParallelRunner runner)
{
assert(width != 0);
assert(height != 0);
assert(stride >= nb_chans * BitDepth::kInputBytes * width);
// Count colors to try palette
std::vector<uint32_t> palette(kHashSize);
palette[0] = 1;
std::vector<int16_t> lookup(kHashSize);
lookup[0] = 0;
int pcolors = 0;
bool collided = effort < 2 || bitdepth.bitdepth != 8 || nb_chans < 4;
// TODO: also do an RGB palette
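// The palette detector uses `palette` as a hash table keyed by
// pixel_hash(p) that stores the pixel value itself. Slots are
// zero-initialized, and since pixel_hash maps 0 to 0, palette[0] is
// pre-set to 1 so that slot 0 has its own distinct "empty" marker
// (init_entry below). If a slot holds neither its empty marker nor the
// same pixel, two distinct colors collided and the palette path is
// abandoned rather than resolved.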
for (size_t y = 0; y < height && !collided; y++) {
const unsigned char *r = rgba + stride * y;
size_t x = 0;
if (nb_chans == 4) {
// this is just an unrolling of the next loop
for (; x + 7 < width; x += 8) {
uint32_t p[8], index[8];
memcpy(p, r + x * 4, 32);
for (int i = 0; i < 8; i++)
index[i] = pixel_hash(p[i]);
for (int i = 0; i < 8; i++) {
uint32_t init_entry = index[i] ? 0 : 1;
if (init_entry != palette[index[i]] && p[i] != palette[index[i]]) {
collided = true;
}
}
for (int i = 0; i < 8; i++)
palette[index[i]] = p[i];
}
for (; x < width; x++) {
uint32_t p;
memcpy(&p, r + x * 4, 4);
uint32_t index = pixel_hash(p);
uint32_t init_entry = index ? 0 : 1;
if (init_entry != palette[index] && p != palette[index]) {
collided = true;
}
palette[index] = p;
}
} else {
for (; x < width; x++) {
uint32_t p = 0;
memcpy(&p, r + x * nb_chans, nb_chans);
uint32_t index = pixel_hash(p);
uint32_t init_entry = index ? 0 : 1;
if (init_entry != palette[index] && p != palette[index]) {
collided = true;
}
palette[index] = p;
}
}
}
int nb_entries = 0;
if (!collided) {
if (palette[0] == 0)
pcolors = 1;
if (palette[0] == 1)
palette[0] = 0;
bool have_color = false;
uint8_t minG = 255, maxG = 0;
for (uint32_t k = 0; k < kHashSize; k++) {
if (palette[k] == 0)
continue;
uint8_t p[4];
memcpy(p, &palette[k], 4);
// move entries to front so sort has less work
palette[nb_entries] = palette[k];
if (p[0] != p[1] || p[0] != p[2])
have_color = true;
if (p[1] < minG)
minG = p[1];
if (p[1] > maxG)
maxG = p[1];
nb_entries++;
// don't do palette if too many colors are needed
if (nb_entries + pcolors > kMaxColors) {
collided = true;
break;
}
}
if (!have_color) {
// don't do palette if it's just grayscale without many holes
if (maxG - minG < nb_entries * 1.4f)
collided = true;
}
}
if (!collided) {
std::sort(
palette.begin(), palette.begin() + nb_entries,
[](uint32_t ap, uint32_t bp) {
if (ap == 0)
return false;
if (bp == 0)
return true;
uint8_t a[4], b[4];
memcpy(a, &ap, 4);
memcpy(b, &bp, 4);
float ay, by;
ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
return ay < by; // sort on alpha*luma
});
for (int k = 0; k < nb_entries; k++) {
if (palette[k] == 0)
break;
lookup[pixel_hash(palette[k])] = pcolors++;
}
}
size_t num_groups_x = (width + 255) / 256;
size_t num_groups_y = (height + 255) / 256;
size_t num_dc_groups_x = (width + 2047) / 2048;
size_t num_dc_groups_y = (height + 2047) / 2048;
uint64_t raw_counts[4][kNumRawSymbols] = {};
uint64_t lz77_counts[4][kNumLZ77] = {};
bool onegroup = num_groups_x == 1 && num_groups_y == 1;
// sample the middle (effort * 2) rows of every group
for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
size_t xg = g % num_groups_x;
size_t yg = g / num_groups_x;
int y_offset = yg * 256;
int y_max = std::min<size_t>(height - yg * 256, 256);
int y_begin = y_offset + std::max<int>(0, y_max - 2 * effort) / 2;
int y_count = std::min<int>(2 * effort * y_max / 256, y_offset + y_max - y_begin - 1);
int x_max = std::min<size_t>(width - xg * 256, 256) / kChunkSize * kChunkSize;
CollectSamples(rgba, xg * 256, y_begin, x_max, stride, y_count, raw_counts,
lz77_counts, onegroup, !collided, bitdepth, nb_chans,
big_endian, lookup.data());
}
// TODO(veluca): can probably improve this and make it bitdepth-dependent.
uint64_t base_raw_counts[kNumRawSymbols] = {
3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
5, 1, 1, 1, 1, 1, 1, 1, 1};
bool doing_ycocg = nb_chans > 2 && collided;
for (size_t i = bitdepth.NumSymbols(doing_ycocg); i < kNumRawSymbols; i++) {
base_raw_counts[i] = 0;
}
for (size_t c = 0; c < 4; c++) {
for (size_t i = 0; i < kNumRawSymbols; i++) {
raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
}
}
if (!collided) {
unsigned token, nbits, bits;
EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
// ensure all palette indices can actually be encoded
for (size_t i = 0; i < token + 1; i++)
raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
// these tokens are only used for the palette itself so they can get a bad
// code
for (size_t i = token + 1; i < 10; i++)
raw_counts[0][i] = 1;
}
uint64_t base_lz77_counts[kNumLZ77] = {
29, 27, 25, 23, 21, 21, 19, 18, 21,
17, 16, 15, 15, 14, 13, 13, 137, 98,
61, 34, 1, 1, 1, 1, 1, 1, 1,
1};
for (auto &lz77_count : lz77_counts) {
for (size_t i = 0; i < kNumLZ77; i++) {
lz77_count[i] = (lz77_count[i] << 8) + base_lz77_counts[i];
}
}
alignas(64) PrefixCode hcode[4];
for (size_t i = 0; i < 4; i++) {
hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
}
size_t num_groups = onegroup ? 1 : (2 + num_dc_groups_x * num_dc_groups_y + num_groups_x * num_groups_y);
JxlFastLosslessFrameState *frame_state = new JxlFastLosslessFrameState();
frame_state->width = width;
frame_state->height = height;
frame_state->nb_chans = nb_chans;
frame_state->bitdepth = bitdepth.bitdepth;
frame_state->group_data = std::vector<std::array<BitWriter, 4> >(num_groups);
if (collided) {
PrepareDCGlobal(onegroup, width, height, nb_chans, hcode,
&frame_state->group_data[0][0]);
} else {
PrepareDCGlobalPalette(onegroup, width, height, hcode, palette, pcolors,
&frame_state->group_data[0][0]);
}
auto run_one = [&](size_t g) {
size_t xg = g % num_groups_x;
size_t yg = g / num_groups_x;
size_t group_id = onegroup ? 0 : (2 + num_dc_groups_x * num_dc_groups_y + g);
size_t xs = std::min<size_t>(width - xg * 256, 256);
size_t ys = std::min<size_t>(height - yg * 256, 256);
size_t x0 = xg * 256;
size_t y0 = yg * 256;
auto &gd = frame_state->group_data[group_id];
if (collided) {
WriteACSection(rgba, x0, y0, xs, ys, stride, onegroup, bitdepth, nb_chans,
big_endian, hcode, gd);
} else {
WriteACSectionPalette(rgba, x0, y0, xs, ys, stride, onegroup, hcode,
lookup.data(), nb_chans, gd[0]);
}
};
runner(
runner_opaque, &run_one,
+[](void *r, size_t i) { (*reinterpret_cast<decltype(&run_one)>(r))(i); },
num_groups_x * num_groups_y);
return frame_state;
}
JxlFastLosslessFrameState *JxlFastLosslessEncodeImpl(
const unsigned char *rgba, size_t width, size_t stride, size_t height, size_t nb_chans,
size_t bitdepth, bool big_endian, int effort, void *runner_opaque, FJxlParallelRunner runner)
{
assert(bitdepth > 0);
assert(nb_chans <= 4);
assert(nb_chans != 0);
if (bitdepth <= 8) {
return LLEnc(rgba, width, stride, height, UpTo8Bits(bitdepth), nb_chans,
big_endian, effort, runner_opaque, runner);
}
if (bitdepth <= 13) {
return LLEnc(rgba, width, stride, height, From9To13Bits(bitdepth), nb_chans,
big_endian, effort, runner_opaque, runner);
}
if (bitdepth == 14) {
return LLEnc(rgba, width, stride, height, Exactly14Bits(bitdepth), nb_chans,
big_endian, effort, runner_opaque, runner);
}
return LLEnc(rgba, width, stride, height, MoreThan14Bits(bitdepth), nb_chans,
big_endian, effort, runner_opaque, runner);
}
} // namespace
#endif // FJXL_SELF_INCLUDE
extern "C" {
size_t JxlFastLosslessEncode(const unsigned char *rgba, size_t width, size_t row_stride, size_t height,
size_t nb_chans, size_t bitdepth, int big_endian, int effort,
unsigned char **output, void *runner_opaque, FJxlParallelRunner runner)
{
auto frame_state = JxlFastLosslessPrepareFrame(
rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
runner_opaque, runner);
JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
/*is_last=*/1);
size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
*output = (unsigned char *)malloc(output_size);
size_t written = 0;
size_t total = 0;
while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
output_size - total))
!= 0) {
total += written;
}
JxlFastLosslessFreeFrameState(frame_state);
return total;
}
JxlFastLosslessFrameState *JxlFastLosslessPrepareFrame(
const unsigned char *rgba, size_t width, size_t row_stride, size_t height, size_t nb_chans, size_t bitdepth,
int big_endian, int effort, void *runner_opaque, FJxlParallelRunner runner)
{
auto trivial_runner = +[](void *, void *opaque, void fun(void *, size_t), size_t count) {
for (size_t i = 0; i < count; i++) {
fun(opaque, i);
}
};
if (runner == nullptr) {
runner = trivial_runner;
}
return JxlFastLosslessEncodeImpl(
rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
runner_opaque, runner);
}
} // extern "C"