bqqbarbhg/bc3_eac.c

## bc3_eac.c
#define _CRT_SECURE_NO_WARNINGS

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include "stb_image_resize.h"

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Element count of an array: total byte size divided by the size of one
// element. Only meaningful for real arrays; a pointer argument yields
// sizeof(pointer) / sizeof(element) instead.
#define ArraySize(arr) (sizeof(arr)/sizeof(*(arr)))

// Modifier tables used by the EAC encoder: 16 tables of 8 signed modifiers.
// Within each row the first four entries are negative offsets and the last
// four are non-negative offsets. EacInitTables() scales every entry by the
// expanded multiplier to build the per-(table, multiplier) lookup.
// NOTE(review): values presumably mirror the ETC2/EAC alpha modifier table
// from the format specification -- confirm against the spec before editing.
const int8_t etc2_alpha_modifiers[][8] = {
	{ -3, -6,  -9, -15, 2, 5, 8, 14, },
	{ -3, -7, -10, -13, 2, 6, 9, 12, },
	{ -2, -5,  -8, -13, 1, 4, 7, 12, },
	{ -2, -4,  -6, -13, 1, 3, 5, 12, },
	{ -3, -6,  -8, -12, 2, 5, 7, 11, },
	{ -3, -7,  -9, -11, 2, 6, 8, 10, },
	{ -4, -7,  -8, -11, 3, 6, 7, 10, },
	{ -3, -5,  -8, -11, 2, 4, 7, 10, },
	{ -2, -6,  -8, -10, 1, 5, 7,  9, },
	{ -2, -5,  -8, -10, 1, 4, 7,  9, },
	{ -2, -4,  -8, -10, 1, 3, 7,  9, },
	{ -2, -5,  -7, -10, 1, 4, 6,  9, },
	{ -3, -4,  -7, -10, 2, 3, 6,  9, },
	{ -1, -2,  -3, -10, 0, 1, 2,  9, },
	{ -4, -6,  -8,  -9, 3, 5, 7,  8, },
	{ -3, -5,  -7,  -9, 2, 4, 6,  8, },
};

// Clamp `value` to the unsigned 11-bit range [0, 2047].
//
// Declared `static inline`: in C99/C11 a plain `inline` definition does not
// provide an external definition, so any call the compiler chooses not to
// inline (e.g. unoptimized builds) would fail to link.
static inline int32_t ClampU11(int32_t value)
{
	if (value < 0) return 0;
	if (value > 2047) return 2047;
	return value;
}

// Return the smaller of two unsigned 32-bit values.
// `static inline` so the helper always has a definition to link against
// (a plain C99 `inline` definition emits no external symbol).
static inline uint32_t MinU32(uint32_t a, uint32_t b)
{
	return a < b ? a : b;
}

// Return the larger of two unsigned 32-bit values.
// `static inline` so the helper always has a definition to link against
// (a plain C99 `inline` definition emits no external symbol).
static inline uint32_t MaxU32(uint32_t a, uint32_t b)
{
	return a < b ? b : a;
}

// Convert a 4-bit multiplier code into the factor applied to the modifier
// table: non-zero codes scale by `multiplier * 8`, the code 0 scales by 1.
// `static inline` so the helper always has a definition to link against
// (a plain C99 `inline` definition emits no external symbol).
static inline int32_t EacExpandMultiplier(int32_t multiplier)
{
	return multiplier ? multiplier * 8 : 1;
}

// Decoder state for one (base codeword, table, multiplier) combination.
typedef struct eac_mode_dec {
	// Base value in the 11-bit domain; EacDecodeInit sets it to base_codeword * 8 + 4
	int32_t base;
	// Points at the 8 pre-multiplied modifiers for the selected table/multiplier
	const int16_t *table;
} eac_mode_dec;

// Precomputed modifier lookup filled by EacInitTables().
typedef struct eac_tables {
	// Indexed as [table_index][multiplier][modifier index]; each entry is the
	// table modifier already scaled by the expanded multiplier
	int16_t table_multiplier[16][16][8];
} eac_tables;

// Fill `tables` with every modifier of every table scaled by every expanded
// multiplier, so the encoder can fetch a ready-to-add offset with one lookup.
void EacInitTables(eac_tables *tables)
{
	for (uint32_t t = 0; t < 16; t++) {
		for (uint32_t m = 0; m < 16; m++) {
			const int32_t scale = EacExpandMultiplier((int32_t)m);
			for (uint32_t k = 0; k < 8; k++) {
				tables->table_multiplier[t][m][k] = (int16_t)(etc2_alpha_modifiers[t][k] * scale);
			}
		}
	}
}

// Build the decoder state for one mode.
//   tables        - lookup produced by EacInitTables()
//   base_codeword - 8-bit base value; expanded to base_codeword * 8 + 4
//   table_index   - modifier table to use (0..15)
//   multiplier    - multiplier code (0..15)
// The returned `table` pointer aliases `tables`, which must outlive the result.
// `static inline` so the helper always has a definition to link against
// (a plain C99 `inline` definition emits no external symbol).
static inline eac_mode_dec EacDecodeInit(const eac_tables *tables, uint32_t base_codeword, uint32_t table_index, uint32_t multiplier)
{
	eac_mode_dec dec;
	dec.base = (int32_t)base_codeword * 8 + 4;
	dec.table = tables->table_multiplier[table_index][multiplier];
	return dec;
}

// Decode an EAC value, result in [0, 2047].
// `index` selects one of the 8 pre-multiplied modifiers of `mode`; the sum of
// base and modifier is clamped to the 11-bit range.
// `static inline` so the helper always has a definition to link against
// (a plain C99 `inline` definition emits no external symbol).
static inline int32_t EacDecodeU11(eac_mode_dec mode, int32_t index)
{
	return ClampU11(mode.base + mode.table[index]);
}

// Absolute value of a signed 32-bit integer.
// The caller must not pass INT32_MIN (its negation is not representable).
// `static inline` so the helper always has a definition to link against
// (a plain C99 `inline` definition emits no external symbol).
static inline int32_t AbsI32(int32_t a)
{
	return a >= 0 ? a : -a;
}

// Pick the best of the 8 modifier indices for each of the 16 values in
// `block_u11` under `mode` and pack them 3 bits per pixel, pixel 0 in the
// highest bits (45..47) and pixel 15 in the lowest (0..2).
//
// On entry `*p_error` is an exclusive upper bound for the summed absolute
// error. As soon as the running sum reaches that bound the function returns 0
// and leaves `*p_error` untouched; otherwise `*p_error` receives the total
// error and the packed indices are returned.
uint64_t EacCompressFit(eac_mode_dec mode, const int32_t *block_u11, int32_t *p_error)
{
	const int32_t error_limit = *p_error;
	uint64_t packed_indices = 0;
	int32_t error_sum = 0;

	for (uint32_t pixel = 0; pixel < 16; pixel++) {
		const int32_t target = block_u11[pixel];

		// Index 0 seeds the search; the strict '<' keeps the lowest index on ties
		uint32_t chosen = 0;
		int32_t chosen_err = AbsI32(target - EacDecodeU11(mode, 0));
		for (uint32_t candidate = 1; candidate < 8; candidate++) {
			const int32_t candidate_err = AbsI32(target - EacDecodeU11(mode, (int32_t)candidate));
			if (candidate_err < chosen_err) {
				chosen_err = candidate_err;
				chosen = candidate;
			}
		}

		const uint32_t shift = (15 - pixel) * 3;
		packed_indices |= (uint64_t)chosen << shift;

		error_sum += chosen_err;
		if (error_sum >= error_limit) {
			// Cannot beat the caller's current best; give up early
			return 0;
		}
	}

	*p_error = error_sum;
	return packed_indices;
}

// Measure how well an already packed 64-bit EAC block reproduces `block_u11`.
// The header fields are unpacked from bits 56..63 (base codeword), 52..55
// (multiplier) and 48..51 (table index); the 16 three-bit selectors follow
// with pixel 0 in the highest position. Returns the summed absolute error.
int32_t EacErrorToCompressed(const eac_tables *tables, uint64_t packed, const int32_t *block_u11)
{
	const uint32_t header = (uint32_t)(packed >> 48);
	const uint32_t base_codeword = (header >> 8) & 0xff;
	const uint32_t multiplier = (header >> 4) & 0xf;
	const uint32_t table_index = header & 0xf;
	const eac_mode_dec mode = EacDecodeInit(tables, base_codeword, table_index, multiplier);

	int32_t error_sum = 0;
	for (uint32_t pixel = 0; pixel < 16; pixel++) {
		const uint32_t shift = (15 - pixel) * 3;
		const uint32_t selector = (uint32_t)(packed >> shift) & 0x7;
		error_sum += AbsI32(block_u11[pixel] - EacDecodeU11(mode, (int32_t)selector));
	}
	return error_sum;
}

// Compress 16 16-bit samples into one 64-bit EAC block.
//
// The base codeword is fixed to the top 8 bits of the midpoint between the
// block's minimum and maximum; all 16 x 16 (table, multiplier) pairs are then
// tried and the one with the lowest summed absolute error (measured in the
// 11-bit domain) wins. `*p_error` is an exclusive bound on entry and holds the
// best error found on exit; when nothing beats the bound 0 is returned and
// `*p_error` keeps its value.
uint64_t EacCompressSimple(const eac_tables *tables, const uint16_t *src, int32_t *p_error)
{
	int32_t samples_u11[16];
	uint32_t lowest = UINT32_MAX;
	uint32_t highest = 0;

	for (uint32_t i = 0; i < 16; i++) {
		const uint32_t sample = src[i];
		if (sample < lowest) lowest = sample;
		if (sample > highest) highest = sample;
		// Keep the top 11 bits of the 16-bit sample
		samples_u11[i] = (int32_t)(src[i] >> 5);
	}

	const uint32_t base_codeword = ((lowest + highest) / 2) >> 8;

	int32_t winning_error = *p_error;
	uint64_t winning_block = 0;

	// mode_id enumerates table_index (high nibble) and multiplier (low nibble)
	for (uint32_t mode_id = 0; mode_id < 256; mode_id++) {
		const uint32_t table_index = mode_id >> 4;
		const uint32_t multiplier = mode_id & 0xf;

		const eac_mode_dec mode = EacDecodeInit(tables, base_codeword, table_index, multiplier);
		int32_t trial_error = winning_error;
		const uint64_t selectors = EacCompressFit(mode, samples_u11, &trial_error);
		if (trial_error >= winning_error) continue;

		winning_error = trial_error;
		winning_block = selectors;
		winning_block |= (uint64_t)base_codeword << 56;
		winning_block |= (uint64_t)multiplier << 52;
		winning_block |= (uint64_t)table_index << 48;
	}

	*p_error = winning_error;
	return winning_block;
}

// 16-bit-per-sample image with interleaved channels.
typedef struct image {
	// Sample data; released with FreeImage()
	uint16_t *pixels;
	// stride: uint16_t elements per row (the constructors in this file set it
	// to width * channels); channels: samples per pixel
	uint32_t stride, channels;
	// Dimensions in pixels
	uint32_t width, height;
} image;

// Fetch channel `c` of the pixel at (x, y), clamping the coordinates to the
// image edge so out-of-range reads replicate the border pixel.
// Returns 0 for an image without pixel data.
//
// `stride` is measured in uint16_t elements per row and already includes the
// channel count, so only the x offset is scaled by `channels` (matching the
// addressing used by PadImage).
uint16_t ImageGetU16(const image *img, uint32_t x, uint32_t y, uint32_t c)
{
	// Guard the `width - 1` / `height - 1` wrap-around on empty images
	if (!img->pixels || img->width == 0 || img->height == 0) return 0;

	x = MinU32(x, img->width - 1);
	y = MinU32(y, img->height - 1);

	// size_t arithmetic avoids 32-bit overflow on large images
	size_t index = (size_t)y * img->stride + (size_t)x * img->channels + c;
	return img->pixels[index];
}

// Compress the 4x4 block at block coordinates (block_x, block_y) of one
// image channel and store the 8-byte result at `dst`, most significant byte
// first. Reads past the image edge are clamped by ImageGetU16.
void EacCompressBlock(const eac_tables *tables, void *dst, const image *img, uint32_t block_x, uint32_t block_y, uint32_t channel)
{
	const uint32_t origin_x = block_x * 4;
	const uint32_t origin_y = block_y * 4;

	// ETC stores texels column by column, so slot i holds column i/4, row i%4
	uint16_t texels[16];
	for (uint32_t i = 0; i < 16; i++) {
		const uint32_t column = i >> 2;
		const uint32_t row = i & 3;
		texels[i] = ImageGetU16(img, origin_x + column, origin_y + row, channel);
	}

	int32_t best_error = INT32_MAX;
	const uint64_t packed = EacCompressSimple(tables, texels, &best_error);

	// Emit the 64-bit word big-endian
	char *out = (char*)dst;
	for (uint32_t shift = 64; shift != 0; ) {
		shift -= 8;
		*out++ = (char)(packed >> shift);
	}
}

// qsort() comparator ordering uint16_t values ascending.
// Returns -1, 0 or 1 when *va is less than, equal to or greater than *vb.
int CompareU16(const void *va, const void *vb)
{
	const uint16_t lhs = *(const uint16_t*)va;
	const uint16_t rhs = *(const uint16_t*)vb;
	return (lhs > rhs) - (lhs < rhs);
}

// Fit 16 16-bit samples to a BC4 palette spanned by `lo` and `hi`.
//
//   swap == false: endpoint 0 is `hi`, endpoint 1 is `lo`, followed by six
//                  evenly interpolated values.
//   swap == true:  endpoint 0 is `lo`, endpoint 1 is `hi`, followed by four
//                  interpolated values plus the constants 0 and 0xffff.
//
// `*p_error` is an exclusive bound on the summed absolute error on entry. When
// the running sum reaches it the function returns 0 and leaves `*p_error`
// alone; otherwise `*p_error` receives the total error and the return value
// holds endpoint 0 in bits 0..7, endpoint 1 in bits 8..15 and the 16 three-bit
// selectors from bit 16 upwards (pixel 0 lowest).
// NOTE(review): the error is measured against the 16-bit `lo`/`hi` values,
// while only their top 8 bits are stored -- confirm this is intended.
uint64_t BC4CompressFit(const uint16_t *src, int32_t lo, int32_t hi, bool swap, int32_t *p_error)
{
	// Only the top byte of each endpoint is stored, so force those to differ
	if ((hi >> 8) == (lo >> 8)) {
		if (hi < 0x8000) {
			hi += 0x100;
		} else {
			lo -= 0x100;
		}
	}

	int32_t palette[8];
	if (swap) {
		palette[0] = lo;
		palette[1] = hi;
		for (int32_t k = 1; k <= 4; k++) {
			palette[1 + k] = ((5 - k) * lo + k * hi) / 5;
		}
		palette[6] = 0;
		palette[7] = 0xffff;
	} else {
		palette[0] = hi;
		palette[1] = lo;
		for (int32_t k = 1; k <= 6; k++) {
			palette[1 + k] = ((7 - k) * hi + k * lo) / 7;
		}
	}

	const int32_t error_limit = *p_error;
	uint64_t selectors = 0;
	int32_t error_sum = 0;

	for (uint32_t pixel = 0; pixel < 16; pixel++) {
		const int32_t target = src[pixel];

		// Entry 0 seeds the search; the strict '<' keeps the lowest index on ties
		uint32_t chosen = 0;
		int32_t chosen_err = AbsI32(target - palette[0]);
		for (uint32_t candidate = 1; candidate < 8; candidate++) {
			const int32_t candidate_err = AbsI32(target - palette[candidate]);
			if (candidate_err < chosen_err) {
				chosen_err = candidate_err;
				chosen = candidate;
			}
		}

		selectors |= (uint64_t)chosen << (pixel * 3);

		error_sum += chosen_err;
		if (error_sum >= error_limit) {
			return 0;
		}
	}

	const uint32_t endpoint0 = palette[0] >> 8;
	const uint32_t endpoint1 = palette[1] >> 8;

	*p_error = error_sum;
	return (selectors << 16) | (endpoint1 << 8) | (endpoint0);
}

// Compress 16 16-bit samples into one 64-bit BC4 block by brute force.
//
// The samples are sorted, then every pair (low endpoint, high endpoint) taken
// from the sorted list with the low one strictly before the high one is tried
// in both palette layouts via BC4CompressFit. `*p_error` is an exclusive bound
// on entry and the best error found on exit; 0 is returned if nothing beats
// the bound.
uint64_t BC4CompressSimple(const uint16_t *src, int32_t *p_error)
{
	uint16_t ordered[16];
	for (uint32_t i = 0; i < 16; i++) {
		ordered[i] = src[i];
	}
	qsort(ordered, 16, sizeof(ordered[0]), &CompareU16);

	int32_t winning_error = *p_error;
	uint64_t winning_block = 0;

	for (uint32_t lo_ix = 0; lo_ix < 15; lo_ix++) {
		const int32_t lo = ordered[lo_ix];

		// Walk the high endpoint from the largest sample down to just above lo_ix
		for (uint32_t hi_ix = 15; hi_ix > lo_ix; hi_ix--) {
			const int32_t hi = ordered[hi_ix];

			for (int layout = 0; layout <= 1; layout++) {
				int32_t trial_error = winning_error;
				const uint64_t trial_block = BC4CompressFit(src, lo, hi, layout == 1, &trial_error);
				if (trial_error >= winning_error) continue;

				winning_error = trial_error;
				winning_block = trial_block;
			}
		}
	}

	*p_error = winning_error;
	return winning_block;
}

// Compress the 4x4 block at block coordinates (block_x, block_y) of one
// image channel with BC4CompressSimple and store the 8-byte result at `dst`,
// least significant byte first. Texels are gathered row by row; reads past
// the image edge are clamped by ImageGetU16.
void BC4CompressBlock(void *dst, const image *img, uint32_t block_x, uint32_t block_y, uint32_t channel)
{
	const uint32_t origin_x = block_x * 4;
	const uint32_t origin_y = block_y * 4;

	// Row-major gather: slot i holds row i/4, column i%4
	uint16_t texels[16];
	for (uint32_t i = 0; i < 16; i++) {
		const uint32_t row = i >> 2;
		const uint32_t column = i & 3;
		texels[i] = ImageGetU16(img, origin_x + column, origin_y + row, channel);
	}

	int32_t best_error = INT32_MAX;
	const uint64_t packed = BC4CompressSimple(texels, &best_error);

	// Emit the 64-bit word little-endian
	char *out = (char*)dst;
	for (uint32_t shift = 0; shift < 64; shift += 8) {
		*out++ = (char)(packed >> shift);
	}
}

// Load an image file as 16-bit samples via stb_image.
//   path         - file to load
//   req_channels - channel count to convert to, or 0 to keep the file's own
// Returns an image whose `pixels` is NULL when loading fails. The caller
// releases the result with FreeImage().
image LoadImage(const char *path, uint32_t req_channels)
{
	image img = { NULL };

	int width = 0, height = 0, file_channels = 0;
	uint16_t *pixels = stbi_load_16(path, &width, &height, &file_channels, (int)req_channels);
	if (!pixels) return img;

	// stb_image always reports the channel count stored in the file, but the
	// returned buffer holds `req_channels` samples per pixel when that is
	// non-zero. Describe the buffer we actually got, not the file.
	uint32_t channels = req_channels != 0 ? req_channels : (uint32_t)file_channels;

	img.pixels = pixels;
	img.width = (uint32_t)width;
	img.height = (uint32_t)height;
	img.stride = (uint32_t)width * channels;
	img.channels = channels;
	return img;
}

// Remap every sample in place to `sample * scale + bias`, clamped to
// [0, 65535] and truncated back to 16 bits. Treats the pixel data as
// width * height * channels contiguous samples.
void CrunchImage(image *img, double scale, double bias)
{
	const uint32_t sample_count = img->width * img->height * img->channels;
	uint16_t *samples = img->pixels;

	for (uint32_t i = 0; i < sample_count; i++) {
		double mapped = (double)samples[i];
		mapped = mapped * scale + bias;
		mapped = mapped < 0.0 ? 0.0 : mapped;
		mapped = mapped > 65535.0 ? 65535.0 : mapped;
		samples[i] = (uint16_t)mapped;
	}
}

// Resample `img` to width x height with the given stb_image_resize filter,
// clamping at the edges and treating the data as sRGB with no alpha channel.
// Returns a newly allocated image (release with FreeImage()). On allocation
// or resize failure the returned image is zeroed, i.e. `pixels` is NULL.
image ResizeImage(const image *img, uint32_t width, uint32_t height, stbir_filter filter)
{
	image res = { NULL };

	// size_t arithmetic so large images do not wrap in 32 bits
	size_t sample_count = (size_t)width * height * img->channels;
	uint16_t *pixels = (uint16_t*)malloc(sample_count * sizeof(uint16_t));
	if (!pixels) return res;

	uint32_t stride = width * img->channels;

	// Strides are passed in bytes; stbir returns 0 on failure
	int ok = stbir_resize_uint16_generic(
		img->pixels, (int)img->width, (int)img->height, (int)(img->stride * sizeof(uint16_t)),
		pixels, (int)width, (int)height, (int)(stride * sizeof(uint16_t)),
		(int)img->channels, STBIR_ALPHA_CHANNEL_NONE, 0, STBIR_EDGE_CLAMP, filter,
		STBIR_COLORSPACE_SRGB, NULL);
	if (!ok) {
		free(pixels);
		return res;
	}

	res.pixels = pixels;
	res.width = width;
	res.height = height;
	res.channels = img->channels;
	res.stride = stride;
	return res;
}

// Copy `img` into a new width x height image. Pixels outside the source are
// filled by replicating the nearest edge pixel (ImageGetU16 clamps the
// coordinates). Returns a newly allocated image (release with FreeImage()).
// When the source has no pixel data or the allocation fails, the returned
// image is zeroed, i.e. `pixels` is NULL.
image PadImage(const image *img, uint32_t width, uint32_t height)
{
	image res = { NULL };
	uint32_t channels = img->channels;

	// An empty source has no edge pixel to replicate
	if (!img->pixels || img->width == 0 || img->height == 0) return res;

	// size_t arithmetic so large images do not wrap in 32 bits
	size_t sample_count = (size_t)width * height * channels;
	uint16_t *pixels = (uint16_t*)malloc(sample_count * sizeof(uint16_t));
	if (!pixels) return res;

	res.pixels = pixels;
	res.width = width;
	res.height = height;
	res.channels = channels;
	res.stride = res.width * res.channels;

	for (uint32_t y = 0; y < height; y++) {
		for (uint32_t x = 0; x < width; x++) {
			uint16_t *dst = res.pixels + (size_t)y * res.stride + (size_t)x * channels;
			for (uint32_t c = 0; c < channels; c++) {
				dst[c] = ImageGetU16(img, x, y, c);
			}
		}
	}

	return res;
}

// Release the pixel buffer owned by `img` and reset every field to zero so
// the struct can be safely freed again or reused.
void FreeImage(image *img)
{
	uint16_t *owned = img->pixels;
	memset(img, 0, sizeof(*img));
	free(owned);
}

// Append `size` raw bytes from `data` to `f`. A zero size is a no-op, so
// `data` may be anything in that case. The fwrite() result is not inspected;
// callers that care can query ferror() on the stream.
void WriteData(FILE *f, const void *data, size_t size)
{
	if (size != 0) {
		fwrite(data, 1, size, f);
	}
}

// Write `v` to `f` as four bytes, least significant byte first, regardless
// of the host byte order.
void WriteU32(FILE *f, uint32_t v)
{
	const uint8_t little_endian[4] = {
		(uint8_t)(v),
		(uint8_t)(v >> 8),
		(uint8_t)(v >> 16),
		(uint8_t)(v >> 24),
	};
	WriteData(f, little_endian, sizeof(little_endian));
}

// One image payload written by WriteKTX(): a 32-bit size followed by the data.
typedef struct ktx_mip {
	// Payload bytes (not owned by this struct)
	const uint8_t *data;
	// Payload size in bytes
	uint32_t size;
} ktx_mip;

// One metadata entry of the KTX key/value section.
typedef struct ktx_key_value {
	// NUL-terminated key; WriteKTX() writes it including the terminator
	const char *key;
	// Value bytes, written verbatim after the key
	const void *value;
	// Number of value bytes
	uint32_t value_size;
} ktx_key_value;

// Header fields emitted by WriteKTX(), in the order they are written after
// the file identifier and endianness marker, plus the key/value entries that
// follow the fixed header.
typedef struct ktx_header {
	uint32_t gl_type;
	uint32_t gl_type_size;
	uint32_t gl_format;
	uint32_t gl_internal_format;
	uint32_t gl_base_internal_format;
	uint32_t pixel_width;
	uint32_t pixel_height;
	uint32_t pixel_depth;
	// WriteKTX() treats 0 as a single layer when validating the mip count
	uint32_t layer_count;
	uint32_t face_count;
	uint32_t level_count;

	// Array of `key_value_count` metadata entries (not owned by this struct)
	const ktx_key_value *key_values;
	uint32_t key_value_count;
} ktx_header;

// Serialize a KTX 1.1 container to `f`: identifier, little-endian header,
// key/value metadata and then `num_mips` image payloads, each prefixed with
// its 32-bit size and padded to a 4-byte boundary.
//   header   - header fields and metadata entries
//   mips     - payloads in file order
//   num_mips - must equal max(layer_count, 1) * face_count * level_count
// Write errors are not reported here; check ferror(f) afterwards.
void WriteKTX(FILE *f, const ktx_header *header, const ktx_mip *mips, size_t num_mips)
{
	const uint8_t magic[12] = {
		0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A,
	};
	WriteData(f, magic, 12);
	WriteU32(f, 0x04030201); // endianness
	WriteU32(f, header->gl_type); // glType
	WriteU32(f, header->gl_type_size); // glTypeSize
	WriteU32(f, header->gl_format); // glFormat
	WriteU32(f, header->gl_internal_format); // glInternalFormat
	WriteU32(f, header->gl_base_internal_format); // glBaseInternalFormat
	WriteU32(f, header->pixel_width); // pixelWidth
	WriteU32(f, header->pixel_height); // pixelHeight
	WriteU32(f, header->pixel_depth); // pixelDepth
	WriteU32(f, header->layer_count); // numberOfArrayElements
	WriteU32(f, header->face_count); // numberOfFaces
	WriteU32(f, header->level_count); // numberOfMipmapLevels

	// bytesOfKeyValueData covers everything between the header and the first
	// image: for every entry the 4-byte keyAndValueByteSize field, the key and
	// value bytes, and the padding up to the next 4-byte boundary.
	uint32_t key_value_size = 0;
	for (uint32_t i = 0; i < header->key_value_count; i++) {
		const ktx_key_value *kv = &header->key_values[i];
		uint32_t key_size = (uint32_t)strlen(kv->key) + 1;
		uint32_t entry_size = key_size + kv->value_size;
		key_value_size += 4;
		key_value_size += entry_size;
		key_value_size += (4 - entry_size % 4) % 4;
	}
	WriteU32(f, key_value_size);
	for (uint32_t i = 0; i < header->key_value_count; i++) {
		const ktx_key_value *kv = &header->key_values[i];
		uint32_t key_size = (uint32_t)strlen(kv->key) + 1;
		uint32_t entry_size = key_size + kv->value_size;
		WriteU32(f, entry_size);
		WriteData(f, kv->key, key_size);
		WriteData(f, kv->value, kv->value_size);
		WriteData(f, "\0\0\0", (4 - entry_size % 4) % 4);
	}

	assert(num_mips == MaxU32(header->layer_count, 1) * header->face_count * header->level_count);
	for (size_t i = 0; i < num_mips; i++) {
		WriteU32(f, mips[i].size);
		WriteData(f, mips[i].data, mips[i].size);
		// mipPadding: keep the next imageSize field 4-byte aligned
		WriteData(f, "\0\0\0", (4 - mips[i].size % 4) % 4);
	}
}

// Write a KTX file to `path` (created or truncated, binary mode).
// Returns true only if the file could be opened, every write succeeded and
// the stream closed cleanly.
bool SaveKTX(const char *path, const ktx_header *header, const ktx_mip *mips, size_t num_mips)
{
	FILE *f = fopen(path, "wb");
	if (!f) return false;

	WriteKTX(f, header, mips, num_mips);

	// WriteKTX cannot report failures itself; a failed fwrite leaves the
	// stream's error indicator set, so query it before closing.
	bool ok = ferror(f) == 0;
	if (fclose(f) != 0) ok = false;
	return ok;
}

int main(int argc, char **argv)
{
	uint32_t mip_count = 3;

	image original_img = LoadImage(argv[1], 1);

	uint32_t alignment = 4 << (mip_count - 1);

	uint32_t original_width = original_img.width;
	uint32_t original_height = original_img.height;

	uint32_t top_width = (original_width + alignment - 1) & ~(alignment - 1);
	uint32_t top_height = (original_height + alignment - 1) & ~(alignment - 1);

	image top_img = PadImage(&original_img, top_width, top_height);
	FreeImage(&original_img);

	ktx_mip mips_eac[16];
	ktx_mip mips_bc4[16];

	eac_tables tables;
	EacInitTables(&tables);

	for (uint32_t mip_ix = 0; mip_ix < mip_count; mip_ix++) {

		image img;
		if (mip_ix == 0) {
			img = top_img;
		} else {
			uint32_t src_width = ((top_width + (1 << mip_ix)/2) >> mip_ix);
			uint32_t src_height = ((top_height + (1 << mip_ix)/2) >> mip_ix);
			img = ResizeImage(&top_img, src_width, src_height, STBIR_FILTER_CATMULLROM);
		}

		uint32_t mip_width = top_width >> mip_ix;
		uint32_t mip_height = top_height >> mip_ix;
		assert(mip_width % 4 == 0);
		assert(mip_height % 4 == 0);
		uint32_t blocks_x = mip_width / 4;
		uint32_t blocks_y = mip_height / 4;
		uint32_t block_size = 8;

		uint8_t *result_eac = calloc(block_size, blocks_x * blocks_y);
		uint8_t *result_bc4 = calloc(block_size, blocks_x * blocks_y);

		for (uint32_t y = 0; y < blocks_y; y++) {
			for (uint32_t x = 0; x < blocks_x; x++) {
				{
					uint8_t *dst = result_eac + y*blocks_x*block_size + x*block_size;
					EacCompressBlock(&tables, dst, &img, x, y, 0);
				}
				{
					uint8_t *dst = result_bc4 + y*blocks_x*block_size + x*block_size;
					BC4CompressBlock(dst, &img, x, y, 0);
				}
			}
			printf("%u/%u\n", y+1, blocks_y);
		}

		mips_eac[mip_ix].data = result_eac;
		mips_eac[mip_ix].size = blocks_x * blocks_y * block_size;

		mips_bc4[mip_ix].data = result_bc4;
		mips_bc4[mip_ix].size = blocks_x * blocks_y * block_size;

		if (mip_ix > 0) {
			FreeImage(&img);
		}
	}

	FreeImage(&top_img);

	ktx_header ktx_base = { 0 };
	ktx_base.gl_type = 0;
	ktx_base.gl_type_size = 1;
	ktx_base.gl_format = 0;
	ktx_base.pixel_width = top_width;
	ktx_base.pixel_height = top_height;
	ktx_base.pixel_depth = 0;
	ktx_base.layer_count = 0;
	ktx_base.face_count = 1;
	ktx_base.level_count = mip_count;

	char mango_json[512];
	snprintf(mango_json, sizeof(mango_json),
		"{ \"originalSize\": { \"x\": %u, \"y\": %u } }",
		original_width, original_height);

	ktx_key_value key_values[] = {
		{ "mango:json", mango_json, strlen(mango_json) + 1 },
	};
	ktx_base.key_values = key_values;
	ktx_base.key_value_count = ArraySize(key_values);

	{
		ktx_header ktx_head = ktx_base;
		ktx_head.gl_internal_format = 0x9270; // COMPRESSED_R11_EAC
		ktx_head.gl_base_internal_format = 0x1903; // GL_RED
		SaveKTX("test_eac.ktx", &ktx_head, mips_eac, mip_count);
	}

	{
		ktx_header ktx_head = ktx_base;
		ktx_head.gl_internal_format = 0x8DBB; // GL_COMPRESSED_RED_RGTC1_EXT
		ktx_head.gl_base_internal_format = 0x1903; // GL_RED
		SaveKTX("test_bc4.ktx", &ktx_head, mips_bc4, mip_count);
	}

	return 0;
}
	#define _CRT_SECURE_NO_WARNINGS

	#define STB_IMAGE_IMPLEMENTATION
	#include "stb_image.h"

	#define STB_IMAGE_RESIZE_IMPLEMENTATION
	#include "stb_image_resize.h"

	#include <stdint.h>
	#include <stdio.h>
	#include <stdbool.h>
	#include <assert.h>

	#define ArraySize(arr) (sizeof(arr)/sizeof(*(arr)))

	const int8_t etc2_alpha_modifiers[][8] = {
	{ -3, -6, -9, -15, 2, 5, 8, 14, },
	{ -3, -7, -10, -13, 2, 6, 9, 12, },
	{ -2, -5, -8, -13, 1, 4, 7, 12, },
	{ -2, -4, -6, -13, 1, 3, 5, 12, },
	{ -3, -6, -8, -12, 2, 5, 7, 11, },
	{ -3, -7, -9, -11, 2, 6, 8, 10, },
	{ -4, -7, -8, -11, 3, 6, 7, 10, },
	{ -3, -5, -8, -11, 2, 4, 7, 10, },
	{ -2, -6, -8, -10, 1, 5, 7, 9, },
	{ -2, -5, -8, -10, 1, 4, 7, 9, },
	{ -2, -4, -8, -10, 1, 3, 7, 9, },
	{ -2, -5, -7, -10, 1, 4, 6, 9, },
	{ -3, -4, -7, -10, 2, 3, 6, 9, },
	{ -1, -2, -3, -10, 0, 1, 2, 9, },
	{ -4, -6, -8, -9, 3, 5, 7, 8, },
	{ -3, -5, -7, -9, 2, 4, 6, 8, },
	};

	inline int32_t ClampU11(int32_t value)
	{
	if (value < 0) return 0;
	if (value > 2047) return 2047;
	return value;
	}

	inline uint32_t MinU32(uint32_t a, uint32_t b)
	{
	return a < b ? a : b;
	}

	inline uint32_t MaxU32(uint32_t a, uint32_t b)
	{
	return a < b ? b : a;
	}

	inline int32_t EacExpandMultiplier(int32_t multiplier)
	{
	return multiplier ? multiplier * 8 : 1;
	}

	typedef struct eac_mode_dec {
	int32_t base;
	const int16_t *table;
	} eac_mode_dec;

	typedef struct eac_tables {
	int16_t table_multiplier[16][16][8];
	} eac_tables;

	void EacInitTables(eac_tables *tables)
	{
	for (uint32_t table_index = 0; table_index < 16; table_index++) {
	const int8_t *src = etc2_alpha_modifiers[table_index];
	for (uint32_t multiplier = 0; multiplier < 16; multiplier++) {
	int32_t multiplier_value = EacExpandMultiplier(multiplier);

	int16_t *dst = tables->table_multiplier[table_index][multiplier];
	for (uint32_t i = 0; i < 8; i++) {
	dst[i] = (int16_t)(src[i] * multiplier_value);
	}
	}
	}
	}

	inline eac_mode_dec EacDecodeInit(const eac_tables *tables, uint32_t base_codeword, uint32_t table_index, uint32_t multiplier)
	{
	eac_mode_dec dec;
	dec.base = (int32_t)base_codeword * 8 + 4;
	dec.table = tables->table_multiplier[table_index][multiplier];
	return dec;
	}

	// Decode an EAC value, result in [0, 2047]
	inline int32_t EacDecodeU11(eac_mode_dec mode, int32_t index)
	{
	return ClampU11(mode.base + mode.table[index]);
	}

	inline int32_t AbsI32(int32_t a)
	{
	return a >= 0 ? a : -a;
	}

	uint64_t EacCompressFit(eac_mode_dec mode, const int32_t block_u11, int32_t p_error)
	{
	uint64_t bits = 0;
	int32_t total_err = 0;
	int32_t max_err = *p_error;
	for (uint32_t px = 0; px < 16; px++) {
	int32_t best_err = INT32_MAX;
	uint32_t best_ti = 0;
	int32_t ref = block_u11[px];

	for (uint32_t ti = 0; ti < 8; ti++) {
	int32_t err = AbsI32(ref - EacDecodeU11(mode, ti));
	if (err < best_err) {
	best_err = err;
	best_ti = ti;
	}
	}

	bits \|= (uint64_t)best_ti << ((15 - px) * 3);
	total_err += best_err;
	if (total_err >= max_err) return 0;
	}
	*p_error = total_err;
	return bits;
	}

	int32_t EacErrorToCompressed(const eac_tables tables, uint64_t packed, const int32_t block_u11)
	{
	uint32_t base_codeword = (uint32_t)(packed >> 56) & 0xff;
	uint32_t multiplier = (uint32_t)(packed >> 52) & 0xf;
	uint32_t table_index = (uint32_t)(packed >> 48) & 0xf;
	eac_mode_dec mode = EacDecodeInit(tables, base_codeword, table_index, multiplier);

	int32_t total_err = 0;
	for (uint32_t px = 0; px < 16; px++) {
	int32_t ref = block_u11[px];

	uint32_t ti = (packed >> ((15 - px) * 3)) & 0x7;
	total_err += AbsI32(ref - EacDecodeU11(mode, ti));
	}
	return total_err;
	}

	uint64_t EacCompressSimple(const eac_tables tables, const uint16_t src, int32_t *p_error)
	{
	int32_t block_u11[16];

	uint32_t min_v = UINT32_MAX, max_v = 0;
	for (uint32_t px = 0; px < 16; px++) {
	min_v = MinU32(min_v, src[px]);
	max_v = MaxU32(max_v, src[px]);
	block_u11[px] = (int32_t)(src[px] >> 5);
	}

	uint32_t base_codeword = (min_v + max_v) / 2 >> 8;

	int32_t best_error = *p_error;
	uint64_t best_bits = 0;

	for (uint32_t table_index = 0; table_index < 16; table_index++) {
	for (uint32_t multiplier = 0; multiplier < 16; multiplier++) {
	eac_mode_dec mode = EacDecodeInit(tables, base_codeword, table_index, multiplier);
	int32_t err = best_error;
	uint64_t pixel_bits = EacCompressFit(mode, block_u11, &err);
	if (err < best_error) {
	best_error = err;
	best_bits = pixel_bits
	\| (uint64_t)base_codeword << 56
	\| (uint64_t)multiplier << 52
	\| (uint64_t)table_index << 48;
	}
	}
	}

	*p_error = best_error;
	return best_bits;
	}

	typedef struct image {
	uint16_t *pixels;
	uint32_t stride, channels;
	uint32_t width, height;
	} image;

	uint16_t ImageGetU16(const image *img, uint32_t x, uint32_t y, uint32_t c)
	{
	x = MinU32(x, img->width - 1);
	y = MinU32(y, img->height - 1);
	return img->pixels[(y * img->stride + x) * img->channels + c];
	}

	void EacCompressBlock(const eac_tables tables, void dst, const image *img, uint32_t block_x, uint32_t block_y, uint32_t channel)
	{
	uint16_t block[16];
	uint32_t base_x = block_x * 4;
	uint32_t base_y = block_y * 4;

	// ETC block layout is vertical so need to load transposed
	for (uint32_t y = 0; y < 4; y++) {
	for (uint32_t x = 0; x < 4; x++) {
	block[x * 4 + y] = ImageGetU16(img, base_x + x, base_y + y, channel);
	}
	}

	int32_t error = INT32_MAX;
	uint64_t result = EacCompressSimple(tables, block, &error);

	char dst_p = (char)dst;
	for (uint32_t i = 0; i < 8; i++) {
	dst_p[i] = (char)(result >> (56 - i * 8));
	}
	}

	int CompareU16(const void va, const void vb)
	{
	const uint16_t a = (const uint16_t)va, b = (const uint16_t)vb;
	if (a != b) return a < b ? -1 : 1;
	return 0;
	}

	uint64_t BC4CompressFit(const uint16_t src, int32_t lo, int32_t hi, bool swap, int32_t p_error)
	{
	if ((hi >> 8) == (lo >> 8)) {
	if (hi < 0x8000) {
	hi += 0x100;
	} else {
	lo -= 0x100;
	}
	}

	int32_t values[8];
	if (swap) {
	values[0] = lo;
	values[1] = hi;
	values[2] = (4lo + 1hi) / 5;
	values[3] = (3lo + 2hi) / 5;
	values[4] = (2lo + 3hi) / 5;
	values[5] = (1lo + 4hi) / 5;
	values[6] = 0;
	values[7] = 0xffff;
	} else {
	values[0] = hi;
	values[1] = lo;
	values[2] = (6hi + 1lo) / 7;
	values[3] = (5hi + 2lo) / 7;
	values[4] = (4hi + 3lo) / 7;
	values[5] = (3hi + 4lo) / 7;
	values[6] = (2hi + 5lo) / 7;
	values[7] = (1hi + 6lo) / 7;
	}

	uint64_t bits = 0;
	int32_t total_err = 0;
	int32_t max_err = *p_error;
	for (uint32_t px = 0; px < 16; px++) {
	int32_t best_err = INT32_MAX;
	uint32_t best_ti = 0;
	int32_t ref = src[px];

	for (uint32_t ti = 0; ti < 8; ti++) {
	int32_t err = AbsI32(ref - values[ti]);
	if (err < best_err) {
	best_err = err;
	best_ti = ti;
	}
	}

	bits \|= (uint64_t)best_ti << (px * 3);
	total_err += best_err;
	if (total_err >= max_err) return 0;
	}

	uint32_t a = values[0] >> 8;
	uint32_t b = values[1] >> 8;

	*p_error = total_err;
	return (bits << 16) \| (b << 8) \| (a);
	}

	uint64_t BC4CompressSimple(const uint16_t src, int32_t p_error)
	{
	uint16_t sorted[16];
	memcpy(sorted, src, 16 * sizeof(uint16_t));
	qsort(sorted, 16, sizeof(uint16_t), &CompareU16);

	int32_t best_error = *p_error;
	uint64_t best_bits = 0;

	for (uint32_t drop_lo = 0; drop_lo < 15; drop_lo++) {
	for (uint32_t drop_hi = 0; drop_hi < 15 - drop_lo; drop_hi++) {
	int32_t lo = sorted[drop_lo];
	int32_t hi = sorted[15 - drop_hi];

	for (uint32_t swap = 0; swap < 2; swap++) {
	int32_t err = best_error;
	uint64_t bits = BC4CompressFit(src, lo, hi, swap != 0, &err);
	if (err < best_error) {
	best_error = err;
	best_bits = bits;
	}
	}
	}
	}

	*p_error = best_error;
	return best_bits;
	}

	void BC4CompressBlock(void dst, const image img, uint32_t block_x, uint32_t block_y, uint32_t channel)
	{
	uint16_t block[16];
	uint32_t base_x = block_x * 4;
	uint32_t base_y = block_y * 4;

	for (uint32_t y = 0; y < 4; y++) {
	for (uint32_t x = 0; x < 4; x++) {
	block[y * 4 + x] = ImageGetU16(img, base_x + x, base_y + y, channel);
	}
	}

	int32_t error = INT32_MAX;
	uint64_t result = BC4CompressSimple(block, &error);

	char dst_p = (char)dst;
	for (uint32_t i = 0; i < 8; i++) {
	dst_p[i] = (char)(result >> (i * 8));
	}
	}

	image LoadImage(const char *path, uint32_t req_channels)
	{
	image img = { NULL };

	int width, height, channels;
	uint16_t *pixels = stbi_load_16(path, &width, &height, &channels, (int)req_channels);
	if (!pixels) return img;

	img.pixels = pixels;
	img.width = width;
	img.height = height;
	img.stride = width * channels;
	img.channels = channels;
	return img;
	}

	void CrunchImage(image *img, double scale, double bias)
	{
	uint32_t count = img->width * img->height * img->channels;
	for (uint32_t i = 0; i < count; i++) {
	double value = (double)img->pixels[i];
	double v = value * scale + bias;
	if (v < 0.0) v = 0.0;
	if (v > 65535.0) v = 65535.0;
	img->pixels[i] = (uint16_t)v;
	}
	}

	image ResizeImage(const image *img, uint32_t width, uint32_t height, stbir_filter filter)
	{
	image res;
	res.pixels = (uint16_t)malloc(width height * img->channels * sizeof(uint16_t));
	res.width = width;
	res.height = height;
	res.channels = img->channels;
	res.stride = res.width * res.channels;

	stbir_resize_uint16_generic(
	img->pixels, (int)img->width, (int)img->height, (int)img->stride * sizeof(uint16_t),
	res.pixels, (int)res.width, (int)res.height, (int)res.stride * sizeof(uint16_t),
	(int)res.channels, STBIR_ALPHA_CHANNEL_NONE, 0, STBIR_EDGE_CLAMP, filter,
	STBIR_COLORSPACE_SRGB, NULL);

	return res;
	}

	image PadImage(const image *img, uint32_t width, uint32_t height)
	{
	uint32_t channels = img->channels;

	image res;
	res.width = width;
	res.height = height;
	res.channels = channels;
	res.stride = res.width * res.channels;
	res.pixels = (uint16_t)malloc(width height * channels * sizeof(uint16_t));

	for (uint32_t y = 0; y < height; y++) {
	for (uint32_t x = 0; x < width; x++) {
	uint16_t dst = res.pixels + y res.stride + x * channels;
	for (uint32_t c = 0; c < channels; c++) {
	dst[c] = ImageGetU16(img, x, y, c);
	}
	}
	}

	return res;
	}

	void FreeImage(image *img)
	{
	free(img->pixels);
	memset(img, 0, sizeof(image));
	}

	void WriteData(FILE f, const void data, size_t size)
	{
	if (size == 0) return;
	fwrite(data, 1, size, f);
	}

	void WriteU32(FILE *f, uint32_t v)
	{
	uint8_t bytes[4];
	for (uint32_t i = 0; i < 4; i++)
	bytes[i] = v >> (i * 8);
	WriteData(f, bytes, 4);
	}

	typedef struct ktx_mip {
	const uint8_t *data;
	uint32_t size;
	} ktx_mip;

	typedef struct ktx_key_value {
	const char *key;
	const void *value;
	uint32_t value_size;
	} ktx_key_value;

	typedef struct ktx_header {
	uint32_t gl_type;
	uint32_t gl_type_size;
	uint32_t gl_format;
	uint32_t gl_internal_format;
	uint32_t gl_base_internal_format;
	uint32_t pixel_width;
	uint32_t pixel_height;
	uint32_t pixel_depth;
	uint32_t layer_count;
	uint32_t face_count;
	uint32_t level_count;

	const ktx_key_value *key_values;
	uint32_t key_value_count;
	} ktx_header;

	void WriteKTX(FILE f, const ktx_header header, const ktx_mip *mips, size_t num_mips)
	{
	const uint8_t magic[12] = {
	0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A,
	};
	WriteData(f, magic, 12);
	WriteU32(f, 0x04030201); // endianness
	WriteU32(f, header->gl_type); // glType
	WriteU32(f, header->gl_type_size); // glTypeSize
	WriteU32(f, header->gl_format); // glFormat
	WriteU32(f, header->gl_internal_format); // glInternalFormat: COMPRESSED_R11_EAC
	WriteU32(f, header->gl_base_internal_format); // glBaseInternalFormat: GL_RED
	WriteU32(f, header->pixel_width); // pixelWidth
	WriteU32(f, header->pixel_height); // pixelHeight
	WriteU32(f, header->pixel_depth); // pixelDepth
	WriteU32(f, header->layer_count); // layerCount
	WriteU32(f, header->face_count); // faceCount
	WriteU32(f, header->level_count); // levelCount

	uint32_t key_value_size = 0;
	for (uint32_t i = 0; i < header->key_value_count; i++) {
	const ktx_key_value *kv = &header->key_values[i];
	uint32_t key_size = (uint32_t)strlen(kv->key) + 1;
	key_value_size += key_size;
	key_value_size += kv->value_size;
	key_value_size += (4 - (key_size + kv->value_size) % 4) % 4;
	}
	WriteU32(f, key_value_size);
	for (uint32_t i = 0; i < header->key_value_count; i++) {
	const ktx_key_value *kv = &header->key_values[i];
	uint32_t key_size = (uint32_t)strlen(kv->key) + 1;
	WriteU32(f, key_size + kv->value_size);
	WriteData(f, kv->key, key_size);
	WriteData(f, kv->value, kv->value_size);
	WriteData(f, "\0\0\0", (4 - (key_size + kv->value_size) % 4) % 4);
	}

	assert(num_mips == MaxU32(header->layer_count, 1) * header->face_count * header->level_count);
	for (uint32_t i = 0; i < num_mips; i++) {
	WriteU32(f, mips[i].size);
	WriteData(f, mips[i].data, mips[i].size);
	}
	}

	bool SaveKTX(const char path, const ktx_header header, const ktx_mip *mips, size_t num_mips)
	{
	FILE *f = fopen(path, "wb");
	if (!f) return false;
	WriteKTX(f, header, mips, num_mips);
	return fclose(f) == 0;
	}

	int main(int argc, char **argv)
	{
	uint32_t mip_count = 3;

	image original_img = LoadImage(argv[1], 1);

	uint32_t alignment = 4 << (mip_count - 1);

	uint32_t original_width = original_img.width;
	uint32_t original_height = original_img.height;

	uint32_t top_width = (original_width + alignment - 1) & ~(alignment - 1);
	uint32_t top_height = (original_height + alignment - 1) & ~(alignment - 1);

	image top_img = PadImage(&original_img, top_width, top_height);
	FreeImage(&original_img);

	ktx_mip mips_eac[16];
	ktx_mip mips_bc4[16];

	eac_tables tables;
	EacInitTables(&tables);

	for (uint32_t mip_ix = 0; mip_ix < mip_count; mip_ix++) {

	image img;
	if (mip_ix == 0) {
	img = top_img;
	} else {
	uint32_t src_width = ((top_width + (1 << mip_ix)/2) >> mip_ix);
	uint32_t src_height = ((top_height + (1 << mip_ix)/2) >> mip_ix);
	img = ResizeImage(&top_img, src_width, src_height, STBIR_FILTER_CATMULLROM);
	}

	uint32_t mip_width = top_width >> mip_ix;
	uint32_t mip_height = top_height >> mip_ix;
	assert(mip_width % 4 == 0);
	assert(mip_height % 4 == 0);
	uint32_t blocks_x = mip_width / 4;
	uint32_t blocks_y = mip_height / 4;
	uint32_t block_size = 8;

	uint8_t result_eac = calloc(block_size, blocks_x blocks_y);
	uint8_t result_bc4 = calloc(block_size, blocks_x blocks_y);

	for (uint32_t y = 0; y < blocks_y; y++) {
	for (uint32_t x = 0; x < blocks_x; x++) {
	{
	uint8_t dst = result_eac + yblocks_xblock_size + xblock_size;
	EacCompressBlock(&tables, dst, &img, x, y, 0);
	}
	{
	uint8_t dst = result_bc4 + yblocks_xblock_size + xblock_size;
	BC4CompressBlock(dst, &img, x, y, 0);
	}
	}
	printf("%u/%u\n", y+1, blocks_y);
	}

	mips_eac[mip_ix].data = result_eac;
	mips_eac[mip_ix].size = blocks_x * blocks_y * block_size;

	mips_bc4[mip_ix].data = result_bc4;
	mips_bc4[mip_ix].size = blocks_x * blocks_y * block_size;

	if (mip_ix > 0) {
	FreeImage(&img);
	}
	}

	FreeImage(&top_img);

	ktx_header ktx_base = { 0 };
	ktx_base.gl_type = 0;
	ktx_base.gl_type_size = 1;
	ktx_base.gl_format = 0;
	ktx_base.pixel_width = top_width;
	ktx_base.pixel_height = top_height;
	ktx_base.pixel_depth = 0;
	ktx_base.layer_count = 0;
	ktx_base.face_count = 1;
	ktx_base.level_count = mip_count;

	char mango_json[512];
	snprintf(mango_json, sizeof(mango_json),
	"{ \"originalSize\": { \"x\": %u, \"y\": %u } }",
	original_width, original_height);

	ktx_key_value key_values[] = {
	{ "mango:json", mango_json, strlen(mango_json) + 1 },
	};
	ktx_base.key_values = key_values;
	ktx_base.key_value_count = ArraySize(key_values);

	{
	ktx_header ktx_head = ktx_base;
	ktx_head.gl_internal_format = 0x9270; // COMPRESSED_R11_EAC
	ktx_head.gl_base_internal_format = 0x1903; // GL_RED
	SaveKTX("test_eac.ktx", &ktx_head, mips_eac, mip_count);
	}

	{
	ktx_header ktx_head = ktx_base;
	ktx_head.gl_internal_format = 0x8DBB; // GL_COMPRESSED_RED_RGTC1_EXT
	ktx_head.gl_base_internal_format = 0x1903; // GL_RED
	SaveKTX("test_bc4.ktx", &ktx_head, mips_bc4, mip_count);
	}

	return 0;
	}