Created
March 30, 2023 18:57
-
-
Save etemesi254/09f66aeab2d38b6b3ef358c3ec9a8a96 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (c) the JPEG XL Project Authors. All rights reserved. | |
// | |
// Use of this source code is governed by a BSD-style | |
// license that can be found in the LICENSE file. | |
#ifndef FJXL_SELF_INCLUDE | |
#define FJXL_SELF_INCLUDE | |
#include "enc_fast_lossless.h" | |
#include <assert.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <algorithm> | |
#include <array> | |
#include <limits> | |
#include <memory> | |
#include <vector> | |
namespace { | |
/// Determine what CPU architecture type we are in | |
#ifdef __BYTE_ORDER__ | |
#define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) | |
#else | |
#define CPU_IS_LITTLE_ENDIAN 1 | |
#endif | |
#define FJXL_INLINE | |
#if defined(_MSC_VER) && !defined(__clang__) | |
#include <intrin.h> | |
FJXL_INLINE uint32_t ff_log(uint32_t v) | |
{ | |
unsigned long index; | |
_BitScanReverse(&index, v); | |
return index; | |
} | |
// Compiles to a memcpy on little-endian systems. | |
FJXL_INLINE void StoreLe64(uint8_t *tgt, uint64_t data) | |
{ | |
if (CPU_IS_LITTLE_ENDIAN()) | |
memcpy(tgt, &data, 8); | |
else { | |
// swap and write | |
uint64_t val = _byteswap_uint64(data); | |
memcpy(tgt, &val, 8); | |
} | |
} | |
#else | |
FJXL_INLINE uint32_t ff_log(uint32_t v) | |
{ | |
return 31 - __builtin_clz(v|1); | |
} | |
// Compiles to a memcpy on little-endian systems. | |
FJXL_INLINE void StoreLe64(uint8_t *tgt, uint64_t data) | |
{ | |
if (CPU_IS_LITTLE_ENDIAN()) | |
memcpy(tgt, &data, 8); | |
else { | |
// swap and write | |
uint64_t val = __builtin_bswap64(data); | |
memcpy(tgt, &val, 8); | |
} | |
} | |
#endif | |
// Add bits to the byte buffer | |
// Returns the number of bytes in the buffer | |
// | |
// Parameters: | |
// nbits: Number of bits to add | |
// bits: Where we are going to pull bits from | |
// data_buf: Where we will write out buffer | |
// bits_in_buffer: Current number of bits in the buffer | |
// bit_buffer: buffer we are going to write to | |
FJXL_INLINE size_t AddBits(uint32_t nbits, uint64_t bits, uint8_t *data_buf, size_t &bits_in_buffer, uint64_t &bit_buffer) | |
{ | |
// add bytes | |
bit_buffer |= bits << bits_in_buffer; | |
bits_in_buffer += nbits; | |
StoreLe64(data_buf, bit_buffer); | |
size_t bytes_in_buffer = bits_in_buffer / 8; | |
bits_in_buffer -= bytes_in_buffer * 8; | |
bit_buffer >>= bytes_in_buffer * 8; | |
return bytes_in_buffer; | |
} | |
// A simple bit writer | |
// that writes in Little Endian | |
struct BitWriter { | |
size_t bytes_written = 0; | |
size_t bits_in_buffer = 0; | |
uint64_t buffer = 0; | |
std::unique_ptr<uint8_t[], void (*)(void *)> data = {nullptr, free}; | |
/// Allocate enough space to store maximum_bit_size bits | |
void Allocate(size_t maximum_bit_size) | |
{ | |
assert(data == nullptr); | |
// Leave some padding. | |
data.reset(static_cast<uint8_t *>(malloc(maximum_bit_size / 8 + 64))); | |
} | |
/// Add some bytes into this bif buffer and flush to the output | |
void Write(uint32_t count, uint64_t bits) | |
{ | |
bytes_written += AddBits(count, bits, data.get() + bytes_written, | |
bits_in_buffer, buffer); | |
} | |
/// Zero pad the bytes in the bit buffer | |
void ZeroPadToByte() | |
{ | |
if (bits_in_buffer != 0) { | |
Write(8 - bits_in_buffer, 0); | |
} | |
} | |
FJXL_INLINE void WriteMultiple(const uint64_t *nbits, const uint64_t *bits, size_t n) | |
{ | |
// Necessary because Write() is only guaranteed to work with <=56 bits. | |
// Trying to SIMD-fy this code results in lower speed (and definitely less | |
// clarity). | |
{ | |
for (size_t i = 0; i < n; i++) { | |
this->buffer |= bits[i] << this->bits_in_buffer; | |
memcpy(this->data.get() + this->bytes_written, &this->buffer, 8); | |
uint64_t shift = 64 - this->bits_in_buffer; | |
this->bits_in_buffer += nbits[i]; | |
// This `if` seems to be faster than using ternaries. | |
if (this->bits_in_buffer >= 64) { | |
uint64_t next_buffer = bits[i] >> shift; | |
this->buffer = next_buffer; | |
this->bits_in_buffer -= 64; | |
this->bytes_written += 8; | |
} | |
} | |
memcpy(this->data.get() + this->bytes_written, &this->buffer, 8); | |
size_t bytes_in_buffer = this->bits_in_buffer / 8; | |
this->bits_in_buffer -= bytes_in_buffer * 8; | |
this->buffer >>= bytes_in_buffer * 8; | |
this->bytes_written += bytes_in_buffer; | |
} | |
} | |
}; | |
} // namespace | |
extern "C" { | |
struct JxlFastLosslessFrameState { | |
// Width of the image | |
size_t width; | |
// Height of the image | |
size_t height; | |
// Number of channels present in image | |
size_t nb_chans; | |
// depth of image | |
size_t bitdepth; | |
// BitWriter used to write the header | |
BitWriter header; | |
std::vector<std::array<BitWriter, 4> > group_data; | |
size_t current_bit_writer = 0; | |
size_t bit_writer_byte_pos = 0; | |
size_t bits_in_buffer = 0; | |
uint64_t bit_buffer = 0; | |
}; | |
// Return the output in bytes of the frame we have written | |
size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState *frame) | |
{ | |
size_t total_size_groups = 0; | |
for (size_t i = 0; i < frame->group_data.size(); i++) { | |
size_t sz = 0; | |
for (size_t j = 0; j < frame->nb_chans; j++) { | |
const auto &writer = frame->group_data[i][j]; | |
sz += writer.bytes_written * 8 + writer.bits_in_buffer; | |
} | |
sz = (sz + 7) / 8; | |
total_size_groups += sz; | |
} | |
return frame->header.bytes_written + total_size_groups; | |
} | |
/// Return the maxiumum required output size for the frame | |
size_t JxlFastLosslessMaxRequiredOutput( | |
const JxlFastLosslessFrameState *frame) | |
{ | |
return JxlFastLosslessOutputSize(frame) + 32; | |
} | |
void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState *frame, | |
int add_image_header, | |
int is_last) | |
{ | |
BitWriter *output = &frame->header; | |
output->Allocate(1000 + frame->group_data.size() * 32); | |
std::vector<size_t> group_sizes(frame->group_data.size()); | |
for (size_t i = 0; i < frame->group_data.size(); i++) { | |
size_t sz = 0; | |
for (size_t j = 0; j < frame->nb_chans; j++) { | |
const auto &writer = frame->group_data[i][j]; | |
sz += writer.bytes_written * 8 + writer.bits_in_buffer; | |
} | |
sz = (sz + 7) / 8; | |
group_sizes[i] = sz; | |
} | |
bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4); | |
if (add_image_header) { | |
// Signature | |
output->Write(16, 0x0AFF); | |
// Size header, hand-crafted. | |
// Not small | |
output->Write(1, 0); | |
auto wsz = [output](size_t size) { | |
if (size - 1 < (1 << 9)) { | |
output->Write(2, 0b00); | |
output->Write(9, size - 1); | |
} else if (size - 1 < (1 << 13)) { | |
output->Write(2, 0b01); | |
output->Write(13, size - 1); | |
} else if (size - 1 < (1 << 18)) { | |
output->Write(2, 0b10); | |
output->Write(18, size - 1); | |
} else { | |
output->Write(2, 0b11); | |
output->Write(30, size - 1); | |
} | |
}; | |
wsz(frame->height); | |
// No special ratio. | |
output->Write(3, 0); | |
wsz(frame->width); | |
// Hand-crafted ImageMetadata. | |
output->Write(1, 0); // all_default | |
output->Write(1, 0); // extra_fields | |
output->Write(1, 0); // bit_depth.floating_point_sample | |
if (frame->bitdepth == 8) { | |
output->Write(2, 0b00); // bit_depth.bits_per_sample = 8 | |
} else if (frame->bitdepth == 10) { | |
output->Write(2, 0b01); // bit_depth.bits_per_sample = 10 | |
} else if (frame->bitdepth == 12) { | |
output->Write(2, 0b10); // bit_depth.bits_per_sample = 12 | |
} else { | |
output->Write(2, 0b11); // 1 + u(6) | |
output->Write(6, frame->bitdepth - 1); | |
} | |
if (frame->bitdepth <= 14) { | |
output->Write(1, 1); // 16-bit-buffer sufficient | |
} else { | |
output->Write(1, 0); // 16-bit-buffer NOT sufficient | |
} | |
if (have_alpha) { | |
output->Write(2, 0b01); // One extra channel | |
output->Write(1, 1); // ... all_default (ie. 8-bit alpha) | |
} else { | |
output->Write(2, 0b00); // No extra channel | |
} | |
output->Write(1, 0); // Not XYB | |
if (frame->nb_chans > 1) { | |
output->Write(1, 1); // color_encoding.all_default (sRGB) | |
} else { | |
output->Write(1, 0); // color_encoding.all_default false | |
output->Write(1, 0); // color_encoding.want_icc false | |
output->Write(2, 1); // grayscale | |
output->Write(2, 1); // D65 | |
output->Write(1, 0); // no gamma transfer function | |
output->Write(2, 0b10); // tf: 2 + u(4) | |
output->Write(4, 11); // tf of sRGB | |
output->Write(2, 1); // relative rendering intent | |
} | |
output->Write(2, 0b00); // No extensions. | |
output->Write(1, 1); // all_default transform data | |
// No ICC, no preview. Frame should start at byte boundery. | |
output->ZeroPadToByte(); | |
} | |
// Handcrafted frame header. | |
output->Write(1, 0); // all_default | |
output->Write(2, 0b00); // regular frame | |
output->Write(1, 1); // modular | |
output->Write(2, 0b00); // default flags | |
output->Write(1, 0); // not YCbCr | |
output->Write(2, 0b00); // no upsampling | |
if (have_alpha) { | |
output->Write(2, 0b00); // no alpha upsampling | |
} | |
output->Write(2, 0b01); // default group size | |
output->Write(2, 0b00); // exactly one pass | |
output->Write(1, 0); // no custom size or origin | |
output->Write(2, 0b00); // kReplace blending mode | |
if (have_alpha) { | |
output->Write(2, 0b00); // kReplace blending mode for alpha channel | |
} | |
output->Write(1, is_last); // is_last | |
output->Write(2, 0b00); // a frame has no name | |
output->Write(1, 0); // loop filter is not all_default | |
output->Write(1, 0); // no gaborish | |
output->Write(2, 0); // 0 EPF iters | |
output->Write(2, 0b00); // No LF extensions | |
output->Write(2, 0b00); // No FH extensions | |
output->Write(1, 0); // No TOC permutation | |
output->ZeroPadToByte(); // TOC is byte-aligned. | |
for (size_t i = 0; i < frame->group_data.size(); i++) { | |
size_t sz = group_sizes[i]; | |
if (sz < (1 << 10)) { | |
output->Write(2, 0b00); | |
output->Write(10, sz); | |
} else if (sz - 1024 < (1 << 14)) { | |
output->Write(2, 0b01); | |
output->Write(14, sz - 1024); | |
} else if (sz - 17408 < (1 << 22)) { | |
output->Write(2, 0b10); | |
output->Write(22, sz - 17408); | |
} else { | |
output->Write(2, 0b11); | |
output->Write(30, sz - 4211712); | |
} | |
} | |
output->ZeroPadToByte(); // Groups are byte-aligned. | |
} | |
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState *frame, | |
unsigned char *output, | |
size_t output_size) | |
{ | |
assert(output_size >= 32); | |
unsigned char *initial_output = output; | |
size_t (*append_bytes_with_bit_offset)(const uint8_t *, size_t, size_t, | |
unsigned char *, uint64_t &) | |
= nullptr; | |
while (true) { | |
size_t &cur = frame->current_bit_writer; | |
size_t &bw_pos = frame->bit_writer_byte_pos; | |
if (cur >= 1 + frame->group_data.size() * frame->nb_chans) { | |
return output - initial_output; | |
} | |
if (output_size <= 8) { | |
return output - initial_output; | |
} | |
size_t nbc = frame->nb_chans; | |
const BitWriter &writer = cur == 0 ? frame->header : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc]; | |
size_t full_byte_count = std::min(output_size - 8, writer.bytes_written - bw_pos); | |
if (frame->bits_in_buffer == 0) { | |
memcpy(output, writer.data.get() + bw_pos, full_byte_count); | |
} else { | |
size_t i = 0; | |
if (append_bytes_with_bit_offset) { | |
i += append_bytes_with_bit_offset( | |
writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer, | |
output, frame->bit_buffer); | |
} | |
for (; i < full_byte_count; i++) { | |
AddBits(8, writer.data.get()[bw_pos + i], output + i, | |
frame->bits_in_buffer, frame->bit_buffer); | |
} | |
} | |
output += full_byte_count; | |
output_size -= full_byte_count; | |
bw_pos += full_byte_count; | |
if (bw_pos == writer.bytes_written) { | |
auto write = [&](size_t num, uint64_t bits) { | |
size_t n = AddBits(num, bits, output, frame->bits_in_buffer, | |
frame->bit_buffer); | |
output += n; | |
output_size -= n; | |
}; | |
if (writer.bits_in_buffer) { | |
write(writer.bits_in_buffer, writer.buffer); | |
} | |
bw_pos = 0; | |
cur++; | |
if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) { | |
write(8 - frame->bits_in_buffer, 0); | |
} | |
} | |
} | |
} | |
void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState *frame) | |
{ | |
delete frame; | |
} | |
} // extern "C" | |
#endif | |
#ifdef FJXL_SELF_INCLUDE | |
namespace { | |
constexpr size_t kNumRawSymbols = 19; | |
constexpr size_t kNumLZ77 = 33; | |
constexpr size_t kLZ77CacheSize = 32; | |
constexpr size_t kLZ77Offset = 224; | |
constexpr size_t kLZ77MinLength = 7; | |
void EncodeHybridUintLZ77(uint32_t value, uint32_t *token, uint32_t *nbits, uint32_t *bits) | |
{ | |
// 400 config | |
uint32_t n = ff_log(value); | |
*token = value < 16 ? value : 16 + n - 4; | |
*nbits = value < 16 ? 0 : n; | |
*bits = value < 16 ? 0 : value - (1 << *nbits); | |
} | |
struct PrefixCode { | |
uint8_t raw_nbits[kNumRawSymbols] = {}; | |
uint8_t raw_bits[kNumRawSymbols] = {}; | |
alignas(64) uint8_t raw_nbits_simd[16] = {}; | |
alignas(64) uint8_t raw_bits_simd[16] = {}; | |
uint8_t lz77_nbits[kNumLZ77] = {}; | |
uint16_t lz77_bits[kNumLZ77] = {}; | |
uint64_t lz77_cache_bits[kLZ77CacheSize] = {}; | |
uint8_t lz77_cache_nbits[kLZ77CacheSize] = {}; | |
static constexpr size_t kMaxNumSymbols = kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1; | |
static uint16_t BitReverse(size_t nbits, uint16_t bits) | |
{ | |
constexpr uint16_t kNibbleLookup[16] = { | |
0b0000,0b1000,0b0100,0b1100, | |
0b0010,0b1010,0b0110,0b1110, | |
0b0001,0b1001,0b0101,0b1101, | |
0b0011,0b1011,0b0111,0b1111, | |
}; | |
uint16_t rev16 = (kNibbleLookup[(bits >> 0) & 0xF] << 12) | |
| (kNibbleLookup[(bits >> 4) & 0xF] << 8) | |
| (kNibbleLookup[(bits >> 8) & 0xF] << 4) | |
| (kNibbleLookup[(bits >> 12) & 0XF] << 0); | |
return rev16 >> (16 - nbits); | |
} | |
// Create the prefix codes given the code lengths. | |
// Supports the code lengths being split into two halves. | |
static void ComputeCanonicalCode(const uint8_t *first_chunk_nbits, | |
uint8_t *first_chunk_bits, | |
size_t first_chunk_size, | |
const uint8_t *second_chunk_nbits, | |
uint16_t *second_chunk_bits, | |
size_t second_chunk_size) | |
{ | |
constexpr size_t kMaxCodeLength = 15; | |
uint8_t code_length_counts[kMaxCodeLength + 1] = {}; | |
for (size_t i = 0; i < first_chunk_size; i++) { | |
code_length_counts[first_chunk_nbits[i]]++; | |
assert(first_chunk_nbits[i] <= kMaxCodeLength); | |
assert(first_chunk_nbits[i] <= 8); | |
assert(first_chunk_nbits[i] > 0); | |
} | |
for (size_t i = 0; i < second_chunk_size; i++) { | |
code_length_counts[second_chunk_nbits[i]]++; | |
assert(second_chunk_nbits[i] <= kMaxCodeLength); | |
} | |
uint16_t next_code[kMaxCodeLength + 1] = {}; | |
uint16_t code = 0; | |
for (size_t i = 1; i < kMaxCodeLength + 1; i++) { | |
code = (code + code_length_counts[i - 1]) << 1; | |
next_code[i] = code; | |
} | |
for (size_t i = 0; i < first_chunk_size; i++) { | |
first_chunk_bits[i] = BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++); | |
} | |
for (size_t i = 0; i < second_chunk_size; i++) { | |
second_chunk_bits[i] = BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++); | |
} | |
} | |
static void ComputeCodeLengthsNonZeroImpl(const uint64_t *freqs, size_t n, size_t precision, uint64_t infty, uint8_t *min_limit, uint8_t *max_limit, uint8_t *nbits) | |
{ | |
std::vector<uint64_t> dynp(((1U << precision) + 1) * (n + 1), infty); | |
auto d = [&](size_t sym, size_t off) -> uint64_t & { | |
return dynp[sym * ((1 << precision) + 1) + off]; | |
}; | |
d(0, 0) = 0; | |
for (size_t sym = 0; sym < n; sym++) { | |
for (uint64_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { | |
size_t off_delta = 1U << (precision - bits); | |
for (size_t off = 0; off + off_delta <= (1U << precision); off++) { | |
d(sym + 1, off + off_delta) = std::min(d(sym, off) + static_cast<uint64_t>(freqs[sym]) * bits, | |
d(sym + 1, off + off_delta)); | |
} | |
} | |
} | |
size_t sym = n; | |
size_t off = 1U << precision; | |
assert(d(sym, off) != infty); | |
while (sym-- > 0) { | |
assert(off > 0); | |
for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { | |
size_t off_delta = 1U << (precision - bits); | |
if (off_delta <= off && d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) { | |
off -= off_delta; | |
nbits[sym] = bits; | |
break; | |
} | |
} | |
} | |
} | |
// Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <= | |
// max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] * | |
// freqs[i]). | |
static void ComputeCodeLengthsNonZero(const uint64_t *freqs, size_t n, uint8_t *min_limit, uint8_t *max_limit, uint8_t *nbits) | |
{ | |
size_t precision = 0; | |
size_t shortest_length = 255; | |
uint64_t freqsum = 0; | |
for (size_t i = 0; i < n; i++) { | |
assert(freqs[i] != 0); | |
freqsum += freqs[i]; | |
if (min_limit[i] < 1) | |
min_limit[i] = 1; | |
assert(min_limit[i] <= max_limit[i]); | |
precision = std::max<size_t>(max_limit[i], precision); | |
shortest_length = std::min<size_t>(min_limit[i], shortest_length); | |
} | |
// If all the minimum limits are greater than 1, shift precision so that we | |
// behave as if the shortest was 1. | |
precision -= shortest_length - 1; | |
uint64_t infty = freqsum * precision; | |
ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit, | |
max_limit, nbits); | |
} | |
static void ComputeCodeLengths(const uint64_t *freqs, size_t n, const uint8_t *min_limit_in, const uint8_t *max_limit_in, uint8_t *nbits) | |
{ | |
assert(n <= kMaxNumSymbols); | |
uint64_t compact_freqs[kMaxNumSymbols]; | |
uint8_t min_limit[kMaxNumSymbols]; | |
uint8_t max_limit[kMaxNumSymbols]; | |
size_t ni = 0; | |
for (size_t i = 0; i < n; i++) { | |
if (freqs[i]) { | |
compact_freqs[ni] = freqs[i]; | |
min_limit[ni] = min_limit_in[i]; | |
max_limit[ni] = max_limit_in[i]; | |
ni++; | |
} | |
} | |
uint8_t num_bits[kMaxNumSymbols] = {}; | |
ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit, | |
num_bits); | |
ni = 0; | |
for (size_t i = 0; i < n; i++) { | |
nbits[i] = 0; | |
if (freqs[i]) { | |
nbits[i] = num_bits[ni++]; | |
} | |
} | |
} | |
// Invalid code, used to construct arrays. | |
PrefixCode() {} | |
template <typename BitDepth> | |
PrefixCode(BitDepth, uint64_t *raw_counts, uint64_t *lz77_counts) | |
{ | |
// "merge" together all the lz77 counts in a single symbol for the level 1 | |
// table (containing just the raw symbols, up to length 7). | |
uint64_t level1_counts[kNumRawSymbols + 1]; | |
memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t)); | |
size_t numraw = kNumRawSymbols; | |
while (numraw > 0 && level1_counts[numraw - 1] == 0) | |
numraw--; | |
level1_counts[numraw] = 0; | |
for (size_t i = 0; i < kNumLZ77; i++) { | |
level1_counts[numraw] += lz77_counts[i]; | |
} | |
uint8_t level1_nbits[kNumRawSymbols + 1] = {}; | |
ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength, | |
BitDepth::kMaxRawLength, level1_nbits); | |
uint8_t level2_nbits[kNumLZ77] = {}; | |
uint8_t min_lengths[kNumLZ77] = {}; | |
uint8_t l = 15 - level1_nbits[numraw]; | |
uint8_t max_lengths[kNumLZ77]; | |
for (size_t i = 0; i < kNumLZ77; i++) { | |
max_lengths[i] = l; | |
} | |
size_t num_lz77 = kNumLZ77; | |
while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) | |
num_lz77--; | |
ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths, | |
level2_nbits); | |
for (size_t i = 0; i < numraw; i++) { | |
raw_nbits[i] = level1_nbits[i]; | |
} | |
for (size_t i = 0; i < num_lz77; i++) { | |
lz77_nbits[i] = level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0; | |
} | |
ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits, | |
kNumLZ77); | |
// Prepare lz77 cache | |
for (size_t count = 0; count < kLZ77CacheSize; count++) { | |
unsigned token, nbits, bits; | |
EncodeHybridUintLZ77(count, &token, &nbits, &bits); | |
lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0]; | |
lz77_cache_bits[count] = (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) | raw_bits[0]; | |
} | |
} | |
void WriteTo(BitWriter *writer) const | |
{ | |
uint64_t code_length_counts[18] = {}; | |
code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1); | |
for (size_t i = 0; i < kNumRawSymbols; i++) { | |
code_length_counts[raw_nbits[i]]++; | |
} | |
for (size_t i = 0; i < kNumLZ77; i++) { | |
code_length_counts[lz77_nbits[i]]++; | |
} | |
uint8_t code_length_nbits[18] = {}; | |
uint8_t code_length_nbits_min[18] = {}; | |
uint8_t code_length_nbits_max[18] = { | |
5,5,5,5,5,5, | |
5,5,5,5,5,5, | |
5,5,5,5,5,5, | |
}; | |
ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min, | |
code_length_nbits_max, code_length_nbits); | |
writer->Write(2, 0b00); // HSKIP = 0, i.e. don't skip code lengths. | |
// As per Brotli RFC. | |
uint8_t code_length_order[18] = {1, 2, 3, 4, 0, 5, 17, 6, 16, | |
7, 8, 9, 10, 11, 12, 13, 14, 15}; | |
uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4}; | |
uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15}; | |
// Encode lengths of code lengths. | |
size_t num_code_lengths = 18; | |
while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) { | |
num_code_lengths--; | |
} | |
for (size_t i = 0; i < num_code_lengths; i++) { | |
int symbol = code_length_nbits[code_length_order[i]]; | |
writer->Write(code_length_length_nbits[symbol], | |
code_length_length_bits[symbol]); | |
} | |
// Compute the canonical codes for the codes that represent the lengths of | |
// the actual codes for data. | |
uint16_t code_length_bits[18] = {}; | |
ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits, | |
code_length_bits, 18); | |
// Encode raw bit code lengths. | |
for (size_t i = 0; i < kNumRawSymbols; i++) { | |
writer->Write(code_length_nbits[raw_nbits[i]], | |
code_length_bits[raw_nbits[i]]); | |
} | |
size_t num_lz77 = kNumLZ77; | |
while (lz77_nbits[num_lz77 - 1] == 0) { | |
num_lz77--; | |
} | |
// Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 = | |
// 205. | |
static_assert(kLZ77Offset == 224); | |
static_assert(kNumRawSymbols == 19); | |
writer->Write(code_length_nbits[17], code_length_bits[17]); | |
writer->Write(3, 0b010); // 5 | |
writer->Write(code_length_nbits[17], code_length_bits[17]); | |
writer->Write(3, 0b000); // (5-2)*8 + 3 = 27 | |
writer->Write(code_length_nbits[17], code_length_bits[17]); | |
writer->Write(3, 0b010); // (27-2)*8 + 5 = 205 | |
// Encode LZ77 symbols, with values 224+i. | |
for (size_t i = 0; i < num_lz77; i++) { | |
writer->Write(code_length_nbits[lz77_nbits[i]], | |
code_length_bits[lz77_nbits[i]]); | |
} | |
} | |
}; | |
void EncodeHybridUint000(uint32_t value, uint32_t *token, uint32_t *nbits, uint32_t *bits) | |
{ | |
uint32_t n = ff_log(value); | |
*token = value ? n + 1 : 0; | |
*nbits = value ? n : 0; | |
*bits = value ? value - (1 << n) : 0; | |
} | |
constexpr static size_t kLogChunkSize = 3; | |
constexpr static size_t kChunkSize = 1 << kLogChunkSize; | |
template <typename Residual> | |
void GenericEncodeChunk(const Residual *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output) | |
{ | |
for (size_t ix = skip; ix < n; ix++) { | |
unsigned token, nbits, bits; | |
EncodeHybridUint000(residuals[ix], &token, &nbits, &bits); | |
output.Write(code.raw_nbits[token] + nbits, | |
code.raw_bits[token] | bits << code.raw_nbits[token]); | |
} | |
} | |
struct UpTo8Bits { | |
size_t bitdepth; | |
explicit UpTo8Bits(size_t bitdepth) | |
: bitdepth(bitdepth) | |
{ | |
assert(bitdepth <= 8); | |
} | |
// Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other | |
// symbols, we could actually go up to 8 Huffman bits as we have at most 8 | |
// extra bits; however, the SIMD bit merging logic for AVX2 assumes that no | |
// Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for | |
// LZ77 lengths and has no limitations except allowing to represent 32 symbols | |
// in total. | |
static constexpr uint8_t kMinRawLength[12] = {}; | |
static constexpr uint8_t kMaxRawLength[12] = { | |
7,7,7,7,7,7, | |
7,7,7,7,7,10 | |
}; | |
static size_t MaxEncodedBitsPerSample() { return 16; } | |
static constexpr size_t kInputBytes = 1; | |
using pixel_t = int16_t; | |
using upixel_t = uint16_t; | |
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output) | |
{ | |
GenericEncodeChunk(residuals, n, skip, code, output); | |
} | |
size_t NumSymbols(bool doing_ycocg) const | |
{ | |
// values gain 1 bit for YCoCg, 1 bit for prediction. | |
// Maximum symbol is 1 + effective bit depth of residuals. | |
if (doing_ycocg) { | |
return bitdepth + 3; | |
} else { | |
return bitdepth + 2; | |
} | |
} | |
}; | |
constexpr uint8_t UpTo8Bits::kMinRawLength[]; | |
constexpr uint8_t UpTo8Bits::kMaxRawLength[]; | |
struct From9To13Bits { | |
size_t bitdepth; | |
explicit From9To13Bits(size_t bitdepth) | |
: bitdepth(bitdepth) | |
{ | |
assert(bitdepth <= 13 && bitdepth >= 9); | |
} | |
// Last symbol is used for LZ77 lengths and has no limitations except allowing | |
// to represent 32 symbols in total. | |
// We cannot fit all the bits in a u16, so do not even try and use up to 8 | |
// bits per raw symbol. | |
// There are at most 16 raw symbols, so Huffman coding can be SIMDfied without | |
// any special tricks. | |
static constexpr uint8_t kMinRawLength[17] = {}; | |
static constexpr uint8_t kMaxRawLength[17] = { | |
8,8,8,8,8,8,8,8, | |
8,8,8,8,8,8,8,8, | |
10 | |
}; | |
static size_t MaxEncodedBitsPerSample() { return 21; } | |
static constexpr size_t kInputBytes = 2; | |
using pixel_t = int16_t; | |
using upixel_t = uint16_t; | |
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output) | |
{ | |
GenericEncodeChunk(residuals, n, skip, code, output); | |
} | |
size_t NumSymbols(bool doing_ycocg) const | |
{ | |
// values gain 1 bit for YCoCg, 1 bit for prediction. | |
// Maximum symbol is 1 + effective bit depth of residuals. | |
if (doing_ycocg) { | |
return bitdepth + 3; | |
} else { | |
return bitdepth + 2; | |
} | |
} | |
}; | |
constexpr uint8_t From9To13Bits::kMinRawLength[]; | |
constexpr uint8_t From9To13Bits::kMaxRawLength[]; | |
void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) | |
{ | |
assert(nbits1 == 8); | |
assert(nbits2 == 8); | |
assert(bits2 == (bits1 | 128)); | |
} | |
struct Exactly14Bits { | |
explicit Exactly14Bits(size_t bitdepth) { assert(bitdepth == 14); } | |
// Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to | |
// have exactly 8, and no other symbol to have 8 or more. This ensures that | |
// the representation for 15 and 16 is identical up to one bit. | |
static constexpr uint8_t kMinRawLength[18] = { | |
0,0,0,0,0,0, | |
0,0,0,0,0,0, | |
0,0,0,8,8,7, | |
}; | |
static constexpr uint8_t kMaxRawLength[18] = { | |
7,7,7,7,7, | |
7,7,7,7,7, | |
7,7,7,7,7, | |
8,8,10, | |
}; | |
static constexpr size_t bitdepth = 14; | |
static size_t MaxEncodedBitsPerSample() { return 22; } | |
static constexpr size_t kInputBytes = 2; | |
using pixel_t = int16_t; | |
using upixel_t = uint16_t; | |
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output) | |
{ | |
GenericEncodeChunk(residuals, n, skip, code, output); | |
} | |
size_t NumSymbols(bool) const { return 17; } | |
}; | |
constexpr uint8_t Exactly14Bits::kMinRawLength[]; | |
constexpr uint8_t Exactly14Bits::kMaxRawLength[]; | |
struct MoreThan14Bits { | |
size_t bitdepth; | |
explicit MoreThan14Bits(size_t bitdepth) | |
: bitdepth(bitdepth) | |
{ | |
assert(bitdepth > 14); | |
assert(bitdepth <= 16); | |
} | |
// Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to | |
// have exactly 8, and no other symbol to have 8 or more. This ensures that | |
// the representation for (13, 14), (15, 16), (17, 18) is identical up to one | |
// bit. | |
static constexpr uint8_t kMinRawLength[20] = { | |
0,0,0,0,0, | |
0,0,0,0,0, | |
0,0,0,8,8, | |
8,8,8,8,7, | |
}; | |
static constexpr uint8_t kMaxRawLength[20] = { | |
7,7,7,7,7, | |
7,7,7,7,7, | |
7,7,7,8,8, | |
8,8,8,8,10 | |
}; | |
static size_t MaxEncodedBitsPerSample() { return 24; } | |
static constexpr size_t kInputBytes = 2; | |
using pixel_t = int32_t; | |
using upixel_t = uint32_t; | |
static void EncodeChunk(upixel_t *residuals, size_t n, size_t skip, const PrefixCode &code, BitWriter &output) | |
{ | |
GenericEncodeChunk(residuals, n, skip, code, output); | |
} | |
size_t NumSymbols(bool) const { return 19; } | |
}; | |
constexpr uint8_t MoreThan14Bits::kMinRawLength[]; | |
constexpr uint8_t MoreThan14Bits::kMaxRawLength[]; | |
void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height, const PrefixCode code[4], BitWriter *output) | |
{ | |
output->Allocate(100000 + (is_single_group ? width * height * 16 : 0)); | |
// No patches, spline or noise. | |
output->Write(1, 1); // default DC dequantization factors (?) | |
output->Write(1, 1); // use global tree / histograms | |
output->Write(1, 0); // no lz77 for the tree | |
output->Write(1, 1); // simple code for the tree's context map | |
output->Write(2, 0); // all contexts clustered together | |
output->Write(1, 1); // use prefix code for tree | |
output->Write(4, 0); // 000 hybrid uint | |
output->Write(6, 0b100011); // Alphabet size is 4 (var16) | |
output->Write(2, 1); // simple prefix code | |
output->Write(2, 3); // with 4 symbols | |
output->Write(2, 0); | |
output->Write(2, 1); | |
output->Write(2, 2); | |
output->Write(2, 3); | |
output->Write(1, 0); // First tree encoding option | |
// Huffman table + extra bits for the tree. | |
uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111}; | |
uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4}; | |
// Write a tree with a leaf per channel, and gradient predictor for every | |
// leaf. | |
for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5, | |
0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) { | |
output->Write(symbol_nbits[v], symbol_bits[v]); | |
} | |
output->Write(1, 1); // Enable lz77 for the main bitstream | |
output->Write(2, 0b00); // lz77 offset 224 | |
static_assert(kLZ77Offset == 224, ""); | |
output->Write(4, 0b1010); // lz77 min length 7 | |
// 400 hybrid uint config for lz77 | |
output->Write(4, 4); | |
output->Write(3, 0); | |
output->Write(3, 0); | |
output->Write(1, 1); // simple code for the context map | |
output->Write(2, 3); // 3 bits per entry | |
output->Write(3, 4); // channel 3 | |
output->Write(3, 3); // channel 2 | |
output->Write(3, 2); // channel 1 | |
output->Write(3, 1); // channel 0 | |
output->Write(3, 0); // distance histogram first | |
output->Write(1, 1); // use prefix codes | |
output->Write(4, 0); // 000 hybrid uint config for distances (only need 0) | |
for (size_t i = 0; i < 4; i++) { | |
output->Write(4, 0); // 000 hybrid uint config for symbols (only <= 10) | |
} | |
// Distance alphabet size: | |
output->Write(5, 0b00001); // 2: just need 1 for RLE (i.e. distance 1) | |
// Symbol + LZ77 alphabet size: | |
for (size_t i = 0; i < 4; i++) { | |
output->Write(1, 1); // > 1 | |
output->Write(4, 8); // <= 512 | |
output->Write(8, 256); // == 512 | |
} | |
// Distance histogram: | |
output->Write(2, 1); // simple prefix code | |
output->Write(2, 0); // with one symbol | |
output->Write(1, 1); // 1 | |
// Symbol + lz77 histogram: | |
for (size_t i = 0; i < 4; i++) { | |
code[i].WriteTo(output); | |
} | |
// Group header for global modular image. | |
output->Write(1, 1); // Global tree | |
output->Write(1, 1); // All default wp | |
} | |
void PrepareDCGlobal(bool is_single_group, size_t width, size_t height, size_t nb_chans, const PrefixCode code[4], BitWriter *output) | |
{ | |
PrepareDCGlobalCommon(is_single_group, width, height, code, output); | |
if (nb_chans > 2) { | |
output->Write(2, 0b01); // 1 transform | |
output->Write(2, 0b00); // RCT | |
output->Write(5, 0b00000); // Starting from ch 0 | |
output->Write(2, 0b00); // YCoCg | |
} else { | |
output->Write(2, 0b00); // no transforms | |
} | |
if (!is_single_group) { | |
output->ZeroPadToByte(); | |
} | |
} | |
template <typename BitDepth> | |
struct ChunkEncoder { | |
FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode &code, BitWriter &output) | |
{ | |
if (count == 0) | |
return; | |
count -= kLZ77MinLength + 1; | |
if (count < kLZ77CacheSize) { | |
output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]); | |
} else { | |
unsigned token, nbits, bits; | |
EncodeHybridUintLZ77(count, &token, &nbits, &bits); | |
uint64_t wbits = bits; | |
wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token]; | |
wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0]; | |
output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits); | |
} | |
} | |
FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t *residuals, size_t skip, size_t n) | |
{ | |
EncodeRle(run, *code, *output); | |
BitDepth::EncodeChunk(residuals, n, skip, *code, *output); | |
} | |
inline void Finalize(size_t run) { EncodeRle(run, *code, *output); } | |
const PrefixCode *code; | |
BitWriter *output; | |
}; | |
template <typename BitDepth> | |
struct ChunkSampleCollector { | |
uint64_t *raw_counts; | |
uint64_t *lz77_counts; | |
FJXL_INLINE void Rle(size_t count) | |
{ | |
if (count == 0) | |
return; | |
raw_counts[0] += 1; | |
count -= kLZ77MinLength + 1; | |
unsigned token, nbits, bits; | |
EncodeHybridUintLZ77(count, &token, &nbits, &bits); | |
lz77_counts[token]++; | |
} | |
FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t *residuals, size_t skip, size_t n) | |
{ | |
// Run is broken. Encode the run and encode the individual vector. | |
Rle(run); | |
for (size_t ix = skip; ix < n; ix++) { | |
unsigned token, nbits, bits; | |
EncodeHybridUint000(residuals[ix], &token, &nbits, &bits); | |
raw_counts[token]++; | |
} | |
} | |
// don't count final run since we don't know how long it really is | |
void Finalize(size_t run) {} | |
}; | |
constexpr uint32_t PackSigned(int32_t value) | |
{ | |
return (static_cast<uint32_t>(value) << 1) ^ ((static_cast<uint32_t>(~value) >> 31) - 1); | |
} | |
template <typename T, typename BitDepth> | |
struct ChannelRowProcessor { | |
// Invariant: run == 0 or run > kLZ77MinLength. | |
size_t run = 0; | |
using upixel_t = typename BitDepth::upixel_t; | |
using pixel_t = typename BitDepth::pixel_t; | |
T t; | |
void ProcessChunk(const pixel_t *row, const pixel_t *row_left, const pixel_t *row_top, const pixel_t *row_topleft, size_t n) | |
{ | |
alignas(64) upixel_t residuals[kChunkSize] = {}; | |
size_t prefix_size = 0; | |
size_t required_prefix_size = 0; | |
for (size_t ix = 0; ix < kChunkSize; ix++) { | |
pixel_t px = row[ix]; | |
pixel_t left = row_left[ix]; | |
pixel_t top = row_top[ix]; | |
pixel_t topleft = row_topleft[ix]; | |
pixel_t ac = left - topleft; | |
pixel_t ab = left - top; | |
pixel_t bc = top - topleft; | |
pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) + static_cast<upixel_t>(top)); | |
pixel_t d = ab ^ bc; | |
pixel_t clamp = d < 0 ? top : left; | |
pixel_t s = ac ^ bc; | |
pixel_t pred = s < 0 ? grad : clamp; | |
residuals[ix] = PackSigned(px - pred); | |
prefix_size = prefix_size == required_prefix_size ? prefix_size + (residuals[ix] == 0) : prefix_size; | |
required_prefix_size += 1; | |
} | |
prefix_size = std::min(n, prefix_size); | |
if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) { | |
// Run continues, nothing to do. | |
run += prefix_size; | |
} else if (prefix_size + run > kLZ77MinLength) { | |
// Run is broken. Encode the run and encode the individual vector. | |
t.Chunk(run + prefix_size, residuals, prefix_size, n); | |
run = 0; | |
} else { | |
// There was no run to begin with. | |
t.Chunk(0, residuals, 0, n); | |
} | |
} | |
void ProcessRow(const pixel_t *row, const pixel_t *row_left, const pixel_t *row_top, const pixel_t *row_topleft, size_t xs) | |
{ | |
for (size_t x = 0; x < xs; x += kChunkSize) { | |
ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x, | |
std::min(kChunkSize, xs - x)); | |
} | |
} | |
void Finalize() { | |
t.Finalize(this->run); | |
} | |
}; | |
uint16_t LoadLE16(const unsigned char *ptr) | |
{ | |
return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8); | |
} | |
uint16_t SwapEndian(uint16_t in) | |
{ | |
return (in >> 8) | (in << 8); | |
} | |
template <typename pixel_t> | |
void FillRowG8(const unsigned char *rgba, size_t oxs, pixel_t *luma) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
luma[x] = rgba[x]; | |
} | |
} | |
template <bool big_endian, typename pixel_t> | |
void FillRowG16(const unsigned char *rgba, size_t oxs, pixel_t *luma) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
uint16_t val = LoadLE16(rgba + 2 * x); | |
if (big_endian) { | |
val = SwapEndian(val); | |
} | |
luma[x] = val; | |
} | |
} | |
template <typename pixel_t> | |
void FillRowGA8(const unsigned char *rgba, size_t oxs, pixel_t *luma, pixel_t *alpha) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
luma[x] = rgba[2 * x]; | |
alpha[x] = rgba[2 * x + 1]; | |
} | |
} | |
template <bool big_endian, typename pixel_t> | |
void FillRowGA16(const unsigned char *rgba, size_t oxs, pixel_t *luma, pixel_t *alpha) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
uint16_t l = LoadLE16(rgba + 4 * x); | |
uint16_t a = LoadLE16(rgba + 4 * x + 2); | |
if (big_endian) { | |
l = SwapEndian(l); | |
a = SwapEndian(a); | |
} | |
luma[x] = l; | |
alpha[x] = a; | |
} | |
} | |
template <typename pixel_t> | |
void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t *y, pixel_t *co, pixel_t *cg) | |
{ | |
*co = r - b; | |
pixel_t tmp = b + (*co >> 1); | |
*cg = g - tmp; | |
*y = tmp + (*cg >> 1); | |
} | |
template <typename pixel_t> | |
void FillRowRGB8(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
uint16_t r = rgba[3 * x]; | |
uint16_t g = rgba[3 * x + 1]; | |
uint16_t b = rgba[3 * x + 2]; | |
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); | |
} | |
} | |
template <bool big_endian, typename pixel_t> | |
void FillRowRGB16(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
uint16_t r = LoadLE16(rgba + 6 * x); | |
uint16_t g = LoadLE16(rgba + 6 * x + 2); | |
uint16_t b = LoadLE16(rgba + 6 * x + 4); | |
if (big_endian) { | |
r = SwapEndian(r); | |
g = SwapEndian(g); | |
b = SwapEndian(b); | |
} | |
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); | |
} | |
} | |
template <typename pixel_t> | |
void FillRowRGBA8(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg, pixel_t *alpha) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
uint16_t r = rgba[4 * x]; | |
uint16_t g = rgba[4 * x + 1]; | |
uint16_t b = rgba[4 * x + 2]; | |
uint16_t a = rgba[4 * x + 3]; | |
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); | |
alpha[x] = a; | |
} | |
} | |
template <bool big_endian, typename pixel_t> | |
void FillRowRGBA16(const unsigned char *rgba, size_t oxs, pixel_t *y, pixel_t *co, pixel_t *cg, pixel_t *alpha) | |
{ | |
size_t x = 0; | |
for (; x < oxs; x++) { | |
uint16_t r = LoadLE16(rgba + 8 * x); | |
uint16_t g = LoadLE16(rgba + 8 * x + 2); | |
uint16_t b = LoadLE16(rgba + 8 * x + 4); | |
uint16_t a = LoadLE16(rgba + 8 * x + 6); | |
if (big_endian) { | |
r = SwapEndian(r); | |
g = SwapEndian(g); | |
b = SwapEndian(b); | |
a = SwapEndian(a); | |
} | |
StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x); | |
alpha[x] = a; | |
} | |
} | |
template <typename Processor, typename BitDepth> | |
void ProcessImageArea(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t yskip, size_t ys, size_t row_stride, BitDepth bitdepth, size_t nb_chans, bool big_endian, Processor *processors) | |
{ | |
using pixel_t = typename BitDepth::pixel_t; | |
constexpr size_t kPadding = 32; | |
constexpr size_t kAlign = 64; | |
constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t); | |
constexpr size_t kNumPx = (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels * kAlignPixels; | |
auto align = [=](pixel_t *ptr) { | |
size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign; | |
if (offset) { | |
ptr += offset / sizeof(pixel_t); | |
} | |
return ptr; | |
}; | |
std::vector<std::array<std::array<pixel_t, kNumPx>, 2> > group_data(nb_chans); | |
for (size_t y = 0; y < ys; y++) { | |
const auto rgba_row = rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes; | |
pixel_t *crow[4] = {}; | |
pixel_t *prow[4] = {}; | |
for (size_t i = 0; i < nb_chans; i++) { | |
crow[i] = align(&group_data[i][y & 1][kPadding]); | |
prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]); | |
} | |
// Pre-fill rows with YCoCg converted pixels. | |
if (nb_chans == 1) { | |
if (BitDepth::kInputBytes == 1) { | |
FillRowG8(rgba_row, xs, crow[0]); | |
} else if (big_endian) { | |
FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]); | |
} else { | |
FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]); | |
} | |
} else if (nb_chans == 2) { | |
if (BitDepth::kInputBytes == 1) { | |
FillRowGA8(rgba_row, xs, crow[0], crow[1]); | |
} else if (big_endian) { | |
FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]); | |
} else { | |
FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]); | |
} | |
} else if (nb_chans == 3) { | |
if (BitDepth::kInputBytes == 1) { | |
FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]); | |
} else if (big_endian) { | |
FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1], | |
crow[2]); | |
} else { | |
FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1], | |
crow[2]); | |
} | |
} else { | |
if (BitDepth::kInputBytes == 1) { | |
FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]); | |
} else if (big_endian) { | |
FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1], | |
crow[2], crow[3]); | |
} else { | |
FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1], | |
crow[2], crow[3]); | |
} | |
} | |
// Deal with x == 0. | |
for (size_t c = 0; c < nb_chans; c++) { | |
*(crow[c] - 1) = y > 0 ? *(prow[c]) : 0; | |
// Fix topleft. | |
*(prow[c] - 1) = y > 0 ? *(prow[c]) : 0; | |
} | |
if (y < yskip) | |
continue; | |
for (size_t c = 0; c < nb_chans; c++) { | |
// Get pointers to px/left/top/topleft data to speedup loop. | |
const pixel_t *row = crow[c]; | |
const pixel_t *row_left = crow[c] - 1; | |
const pixel_t *row_top = y == 0 ? row_left : prow[c]; | |
const pixel_t *row_topleft = y == 0 ? row_left : prow[c] - 1; | |
processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs); | |
} | |
} | |
for (size_t c = 0; c < nb_chans; c++) { | |
processors[c].Finalize(); | |
} | |
} | |
template <typename BitDepth> | |
void WriteACSection(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t ys, size_t row_stride, bool is_single_group, BitDepth bitdepth, size_t nb_chans, bool big_endian, const PrefixCode code[4], std::array<BitWriter, 4> &output) | |
{ | |
for (size_t i = 0; i < nb_chans; i++) { | |
if (is_single_group && i == 0) | |
continue; | |
output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4); | |
} | |
if (!is_single_group) { | |
// Group header for modular image. | |
// When the image is single-group, the global modular image is the one | |
// that contains the pixel data, and there is no group header. | |
output[0].Write(1, 1); // Global tree | |
output[0].Write(1, 1); // All default wp | |
output[0].Write(2, 0b00); // 0 transforms | |
} | |
ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4]; | |
for (size_t c = 0; c < nb_chans; c++) { | |
row_encoders[c].t = ChunkEncoder<BitDepth>(); | |
row_encoders[c].t.output = &output[c]; | |
row_encoders[c].t.code = &code[c]; | |
} | |
ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> >( | |
rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian, | |
row_encoders); | |
} | |
constexpr int kHashExp = 16; | |
constexpr uint32_t kHashSize = 1 << kHashExp; | |
constexpr uint32_t kHashMultiplier = 2654435761; | |
constexpr int kMaxColors = 512; | |
// can be any function that returns a value in 0 .. kHashSize-1 | |
// has to map 0 to 0 | |
inline uint32_t pixel_hash(uint32_t p) | |
{ | |
return (p * kHashMultiplier) >> (32 - kHashExp); | |
} | |
template <size_t nb_chans> | |
void FillRowPalette(const unsigned char *inrow, size_t xs, const int16_t *lookup, int16_t *out) | |
{ | |
for (size_t x = 0; x < xs; x++) { | |
uint32_t p = 0; | |
memcpy(&p, inrow + x * nb_chans, nb_chans); | |
out[x] = lookup[pixel_hash(p)]; | |
} | |
} | |
template <typename Processor> | |
void ProcessImageAreaPalette(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t yskip, | |
size_t ys, size_t row_stride, const int16_t *lookup, | |
size_t nb_chans, Processor *processors) | |
{ | |
constexpr size_t kPadding = 32; | |
std::vector<std::array<int16_t, 256 + kPadding * 2> > group_data(2); | |
Processor &row_encoder = processors[0]; | |
for (size_t y = 0; y < ys; y++) { | |
// Pre-fill rows with palette converted pixels. | |
const unsigned char *inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans; | |
int16_t *outrow = &group_data[y & 1][kPadding]; | |
if (nb_chans == 1) { | |
FillRowPalette<1>(inrow, xs, lookup, outrow); | |
} else if (nb_chans == 2) { | |
FillRowPalette<2>(inrow, xs, lookup, outrow); | |
} else if (nb_chans == 3) { | |
FillRowPalette<3>(inrow, xs, lookup, outrow); | |
} else if (nb_chans == 4) { | |
FillRowPalette<4>(inrow, xs, lookup, outrow); | |
} | |
// Deal with x == 0. | |
group_data[y & 1][kPadding - 1] = y > 0 ? group_data[(y - 1) & 1][kPadding] : 0; | |
// Fix topleft. | |
group_data[(y - 1) & 1][kPadding - 1] = y > 0 ? group_data[(y - 1) & 1][kPadding] : 0; | |
// Get pointers to px/left/top/topleft data to speedup loop. | |
const int16_t *row = &group_data[y & 1][kPadding]; | |
const int16_t *row_left = &group_data[y & 1][kPadding - 1]; | |
const int16_t *row_top = y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding]; | |
const int16_t *row_topleft = y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1]; | |
row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs); | |
} | |
row_encoder.Finalize(); | |
} | |
void WriteACSectionPalette(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t ys, size_t row_stride, bool is_single_group, const PrefixCode code[4], const int16_t *lookup, size_t nb_chans, BitWriter &output) | |
{ | |
if (!is_single_group) { | |
output.Allocate(16 * xs * ys + 4); | |
// Group header for modular image. | |
// When the image is single-group, the global modular image is the one | |
// that contains the pixel data, and there is no group header. | |
output.Write(1, 1); // Global tree | |
output.Write(1, 1); // All default wp | |
output.Write(2, 0b00); // 0 transforms | |
} | |
ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder; | |
row_encoder.t = ChunkEncoder<UpTo8Bits>(); | |
row_encoder.t.output = &output; | |
row_encoder.t.code = &code[is_single_group?1:0]; | |
ProcessImageAreaPalette< | |
ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>( | |
rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder); | |
} | |
template <typename BitDepth> | |
void CollectSamples(const unsigned char *rgba, size_t x0, size_t y0, size_t xs, size_t row_stride, | |
size_t row_count, uint64_t raw_counts[4][kNumRawSymbols], | |
uint64_t lz77_counts[4][kNumLZ77], bool is_single_group, | |
bool palette, BitDepth bitdepth, size_t nb_chans, | |
bool big_endian, const int16_t *lookup) | |
{ | |
if (palette) { | |
ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits> | |
row_sample_collectors[4]; | |
for (size_t c = 0; c < nb_chans; c++) { | |
row_sample_collectors[c].t = ChunkSampleCollector<UpTo8Bits>(); | |
row_sample_collectors[c].t.raw_counts = raw_counts[is_single_group ? 1 : 0]; | |
row_sample_collectors[c].t.lz77_counts = lz77_counts[is_single_group ? 1 : 0]; | |
} | |
ProcessImageAreaPalette< | |
ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits> >( | |
rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans, | |
row_sample_collectors); | |
} else { | |
ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth> | |
row_sample_collectors[4]; | |
for (size_t c = 0; c < nb_chans; c++) { | |
row_sample_collectors[c].t = ChunkSampleCollector<BitDepth>(); | |
row_sample_collectors[c].t.raw_counts = raw_counts[c]; | |
row_sample_collectors[c].t.lz77_counts = lz77_counts[c]; | |
} | |
ProcessImageArea< | |
ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth> >( | |
rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans, | |
big_endian, row_sample_collectors); | |
} | |
} | |
void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height, | |
const PrefixCode code[4], const std::vector<uint32_t> &palette, | |
size_t pcolors, BitWriter *output) | |
{ | |
PrepareDCGlobalCommon(is_single_group, width, height, code, output); | |
output->Write(2, 0b01); // 1 transform | |
output->Write(2, 0b01); // Palette | |
output->Write(5, 0b00000); // Starting from ch 0 | |
output->Write(2, 0b10); // 4-channel palette (RGBA) | |
// pcolors <= kMaxColors + kChunkSize - 1 | |
static_assert(kMaxColors + kChunkSize < 1281, | |
"add code to signal larger palette sizes"); | |
if (pcolors < 256) { | |
output->Write(2, 0b00); | |
output->Write(8, pcolors); | |
} else { | |
output->Write(2, 0b01); | |
output->Write(10, pcolors - 256); | |
} | |
output->Write(2, 0b00); // nb_deltas == 0 | |
output->Write(4, 0); // Zero predictor for delta palette | |
// Encode palette | |
ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder; | |
row_encoder.t = ChunkEncoder<UpTo8Bits>(); | |
row_encoder.t.output = output; | |
row_encoder.t.code = &code[0]; | |
int16_t p[4][32 + 1024] = {}; | |
uint8_t prgba[4]; | |
size_t i = 0; | |
size_t have_zero = 0; | |
if (palette[pcolors - 1] == 0) | |
have_zero = 1; | |
for (; i < pcolors; i++) { | |
memcpy(prgba, &palette[i], 4); | |
p[0][16 + i + have_zero] = prgba[0]; | |
p[1][16 + i + have_zero] = prgba[1]; | |
p[2][16 + i + have_zero] = prgba[2]; | |
p[3][16 + i + have_zero] = prgba[3]; | |
} | |
p[0][15] = 0; | |
row_encoder.ProcessRow(p[0] + 16, p[0] + 15, p[0] + 15, p[0] + 15, pcolors); | |
p[1][15] = p[0][16]; | |
p[0][15] = p[0][16]; | |
row_encoder.ProcessRow(p[1] + 16, p[1] + 15, p[0] + 16, p[0] + 15, pcolors); | |
p[2][15] = p[1][16]; | |
p[1][15] = p[1][16]; | |
row_encoder.ProcessRow(p[2] + 16, p[2] + 15, p[1] + 16, p[1] + 15, pcolors); | |
p[3][15] = p[2][16]; | |
p[2][15] = p[2][16]; | |
row_encoder.ProcessRow(p[3] + 16, p[3] + 15, p[2] + 16, p[2] + 15, pcolors); | |
row_encoder.Finalize(); | |
if (!is_single_group) { | |
output->ZeroPadToByte(); | |
} | |
} | |
template <typename BitDepth> | |
JxlFastLosslessFrameState *LLEnc(const unsigned char *rgba, size_t width, size_t stride, size_t height, | |
BitDepth bitdepth, size_t nb_chans, bool big_endian, | |
int effort, void *runner_opaque, FJxlParallelRunner runner) | |
{ | |
assert(width != 0); | |
assert(height != 0); | |
assert(stride >= nb_chans * BitDepth::kInputBytes * width); | |
// Count colors to try palette | |
std::vector<uint32_t> palette(kHashSize); | |
palette[0] = 1; | |
std::vector<int16_t> lookup(kHashSize); | |
lookup[0] = 0; | |
int pcolors = 0; | |
bool collided = effort < 2 || bitdepth.bitdepth != 8 || nb_chans < 4; | |
// todo: also do rgb palette | |
for (size_t y = 0; y < height && !collided; y++) { | |
const unsigned char *r = rgba + stride * y; | |
size_t x = 0; | |
if (nb_chans == 4) { | |
// this is just an unrolling of the next loop | |
for (; x + 7 < width; x += 8) { | |
uint32_t p[8], index[8]; | |
memcpy(p, r + x * 4, 32); | |
for (int i = 0; i < 8; i++) | |
index[i] = pixel_hash(p[i]); | |
for (int i = 0; i < 8; i++) { | |
uint32_t init_entry = index[i] ? 0 : 1; | |
if (init_entry != palette[index[i]] && p[i] != palette[index[i]]) { | |
collided = true; | |
} | |
} | |
for (int i = 0; i < 8; i++) | |
palette[index[i]] = p[i]; | |
} | |
for (; x < width; x++) { | |
uint32_t p; | |
memcpy(&p, r + x * 4, 4); | |
uint32_t index = pixel_hash(p); | |
uint32_t init_entry = index ? 0 : 1; | |
if (init_entry != palette[index] && p != palette[index]) { | |
collided = true; | |
} | |
palette[index] = p; | |
} | |
} else { | |
for (; x < width; x++) { | |
uint32_t p = 0; | |
memcpy(&p, r + x * nb_chans, nb_chans); | |
uint32_t index = pixel_hash(p); | |
uint32_t init_entry = index ? 0 : 1; | |
if (init_entry != palette[index] && p != palette[index]) { | |
collided = true; | |
} | |
palette[index] = p; | |
} | |
} | |
} | |
int nb_entries = 0; | |
if (!collided) { | |
if (palette[0] == 0) | |
pcolors = 1; | |
if (palette[0] == 1) | |
palette[0] = 0; | |
bool have_color = false; | |
uint8_t minG = 255, maxG = 0; | |
for (uint32_t k = 0; k < kHashSize; k++) { | |
if (palette[k] == 0) | |
continue; | |
uint8_t p[4]; | |
memcpy(p, &palette[k], 4); | |
// move entries to front so sort has less work | |
palette[nb_entries] = palette[k]; | |
if (p[0] != p[1] || p[0] != p[2]) | |
have_color = true; | |
if (p[1] < minG) | |
minG = p[1]; | |
if (p[1] > maxG) | |
maxG = p[1]; | |
nb_entries++; | |
// don't do palette if too many colors are needed | |
if (nb_entries + pcolors > kMaxColors) { | |
collided = true; | |
break; | |
} | |
} | |
if (!have_color) { | |
// don't do palette if it's just grayscale without many holes | |
if (maxG - minG < nb_entries * 1.4f) | |
collided = true; | |
} | |
} | |
if (!collided) { | |
std::sort( | |
palette.begin(), palette.begin() + nb_entries, | |
[](uint32_t ap, uint32_t bp) { | |
if (ap == 0) | |
return false; | |
if (bp == 0) | |
return true; | |
uint8_t a[4], b[4]; | |
memcpy(a, &ap, 4); | |
memcpy(b, &bp, 4); | |
float ay, by; | |
ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3]; | |
by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3]; | |
return ay < by; // sort on alpha*luma | |
}); | |
for (int k = 0; k < nb_entries; k++) { | |
if (palette[k] == 0) | |
break; | |
lookup[pixel_hash(palette[k])] = pcolors++; | |
} | |
} | |
size_t num_groups_x = (width + 255) / 256; | |
size_t num_groups_y = (height + 255) / 256; | |
size_t num_dc_groups_x = (width + 2047) / 2048; | |
size_t num_dc_groups_y = (height + 2047) / 2048; | |
uint64_t raw_counts[4][kNumRawSymbols] = {}; | |
uint64_t lz77_counts[4][kNumLZ77] = {}; | |
bool onegroup = num_groups_x == 1 && num_groups_y == 1; | |
// sample the middle (effort * 2) rows of every group | |
for (size_t g = 0; g < num_groups_y * num_groups_x; g++) { | |
size_t xg = g % num_groups_x; | |
size_t yg = g / num_groups_x; | |
int y_offset = yg * 256; | |
int y_max = std::min<size_t>(height - yg * 256, 256); | |
int y_begin = y_offset + std::max<int>(0, y_max - 2 * effort) / 2; | |
int y_count = std::min<int>(2 * effort * y_max / 256, y_offset + y_max - y_begin - 1); | |
int x_max = std::min<size_t>(width - xg * 256, 256) / kChunkSize * kChunkSize; | |
CollectSamples(rgba, xg * 256, y_begin, x_max, stride, y_count, raw_counts, | |
lz77_counts, onegroup, !collided, bitdepth, nb_chans, | |
big_endian, lookup.data()); | |
} | |
// TODO(veluca): can probably improve this and make it bitdepth-dependent. | |
uint64_t base_raw_counts[kNumRawSymbols] = { | |
3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51, | |
5, 1, 1, 1, 1, 1, 1, 1, 1}; | |
bool doing_ycocg = nb_chans > 2 && collided; | |
for (size_t i = bitdepth.NumSymbols(doing_ycocg); i < kNumRawSymbols; i++) { | |
base_raw_counts[i] = 0; | |
} | |
for (size_t c = 0; c < 4; c++) { | |
for (size_t i = 0; i < kNumRawSymbols; i++) { | |
raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i]; | |
} | |
} | |
if (!collided) { | |
unsigned token, nbits, bits; | |
EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits); | |
// ensure all palette indices can actually be encoded | |
for (size_t i = 0; i < token + 1; i++) | |
raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1); | |
// these tokens are only used for the palette itself so they can get a bad | |
// code | |
for (size_t i = token + 1; i < 10; i++) | |
raw_counts[0][i] = 1; | |
} | |
uint64_t base_lz77_counts[kNumLZ77] = { | |
29, 27, 25, 23, 21, 21, 19, 18, 21, | |
17, 16, 15, 15, 14, 13, 13, 137, 98, | |
61, 34, 1, 1, 1, 1, 1, 1, 1, | |
1}; | |
for (auto &lz77_count : lz77_counts) { | |
for (size_t i = 0; i < kNumLZ77; i++) { | |
lz77_count[i] = (lz77_count[i] << 8) + base_lz77_counts[i]; | |
} | |
} | |
alignas(64) PrefixCode hcode[4]; | |
for (size_t i = 0; i < 4; i++) { | |
hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]); | |
} | |
size_t num_groups = onegroup ? 1 : (2 + num_dc_groups_x * num_dc_groups_y + num_groups_x * num_groups_y); | |
JxlFastLosslessFrameState *frame_state = new JxlFastLosslessFrameState(); | |
frame_state->width = width; | |
frame_state->height = height; | |
frame_state->nb_chans = nb_chans; | |
frame_state->bitdepth = bitdepth.bitdepth; | |
frame_state->group_data = std::vector<std::array<BitWriter, 4> >(num_groups); | |
if (collided) { | |
PrepareDCGlobal(onegroup, width, height, nb_chans, hcode, | |
&frame_state->group_data[0][0]); | |
} else { | |
PrepareDCGlobalPalette(onegroup, width, height, hcode, palette, pcolors, | |
&frame_state->group_data[0][0]); | |
} | |
auto run_one = [&](size_t g) { | |
size_t xg = g % num_groups_x; | |
size_t yg = g / num_groups_x; | |
size_t group_id = onegroup ? 0 : (2 + num_dc_groups_x * num_dc_groups_y + g); | |
size_t xs = std::min<size_t>(width - xg * 256, 256); | |
size_t ys = std::min<size_t>(height - yg * 256, 256); | |
size_t x0 = xg * 256; | |
size_t y0 = yg * 256; | |
auto &gd = frame_state->group_data[group_id]; | |
if (collided) { | |
WriteACSection(rgba, x0, y0, xs, ys, stride, onegroup, bitdepth, nb_chans, | |
big_endian, hcode, gd); | |
} else { | |
WriteACSectionPalette(rgba, x0, y0, xs, ys, stride, onegroup, hcode, | |
lookup.data(), nb_chans, gd[0]); | |
} | |
}; | |
runner( | |
runner_opaque, &run_one, | |
+[](void *r, size_t i) { (*reinterpret_cast<decltype(&run_one)>(r))(i); }, | |
num_groups_x * num_groups_y); | |
return frame_state; | |
} | |
JxlFastLosslessFrameState *JxlFastLosslessEncodeImpl( | |
const unsigned char *rgba, size_t width, size_t stride, size_t height, size_t nb_chans, | |
size_t bitdepth, bool big_endian, int effort, void *runner_opaque, FJxlParallelRunner runner) | |
{ | |
assert(bitdepth > 0); | |
assert(nb_chans <= 4); | |
assert(nb_chans != 0); | |
if (bitdepth <= 8) { | |
return LLEnc(rgba, width, stride, height, UpTo8Bits(bitdepth), nb_chans, | |
big_endian, effort, runner_opaque, runner); | |
} | |
if (bitdepth <= 13) { | |
return LLEnc(rgba, width, stride, height, From9To13Bits(bitdepth), nb_chans, | |
big_endian, effort, runner_opaque, runner); | |
} | |
if (bitdepth == 14) { | |
return LLEnc(rgba, width, stride, height, Exactly14Bits(bitdepth), nb_chans, | |
big_endian, effort, runner_opaque, runner); | |
} | |
return LLEnc(rgba, width, stride, height, MoreThan14Bits(bitdepth), nb_chans, | |
big_endian, effort, runner_opaque, runner); | |
} | |
} // namespace | |
#endif // FJXL_SELF_INCLUDE | |
extern "C" { | |
size_t JxlFastLosslessEncode(const unsigned char *rgba, size_t width, size_t row_stride, size_t height, | |
size_t nb_chans, size_t bitdepth, int big_endian, int effort, | |
unsigned char **output, void *runner_opaque, FJxlParallelRunner runner) | |
{ | |
auto frame_state = JxlFastLosslessPrepareFrame( | |
rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort, | |
runner_opaque, runner); | |
JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1, | |
/*is_last=*/1); | |
size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state); | |
*output = (unsigned char *)malloc(output_size); | |
size_t written = 0; | |
size_t total = 0; | |
while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total, | |
output_size - total)) | |
!= 0) { | |
total += written; | |
} | |
return total; | |
} | |
JxlFastLosslessFrameState *JxlFastLosslessPrepareFrame( | |
const unsigned char *rgba, size_t width, size_t row_stride, size_t height, size_t nb_chans, size_t bitdepth, | |
int big_endian, int effort, void *runner_opaque, FJxlParallelRunner runner) | |
{ | |
auto trivial_runner = +[](void *, void *opaque, void fun(void *, size_t), size_t count) { | |
for (size_t i = 0; i < count; i++) { | |
fun(opaque, i); | |
} | |
}; | |
if (runner == nullptr) { | |
runner = trivial_runner; | |
} | |
return JxlFastLosslessEncodeImpl( | |
rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort, | |
runner_opaque, runner); | |
} | |
} // extern "C" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment