SirKane/Image.cpp Secret

## Image.cpp
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <stdint.h>

#include "../Shared/Buffer.h"
#include "../Shared/FileHelpers.h"
#include "../Shared/BitStreams.h"

//Compression scheme identifier stored in the first byte of a WCT file.
enum class eWCTType : uint8_t{
	zlib = 0x12,
	lzss = 0x13,
};

/*
Header found at the start of a WCT file. DecodeImage and main fill it
directly through CDynMemoryReader::Get, so the member order and sizes must
match the file layout; the trailing comments give the byte offset of each field.
*/
struct SWCTHeader{
	eWCTType	CompressionType; //0x12 or 0x13  @0
	uint8_t		Version; //?, @1
	uint8_t		BitCount; //@2
	uint8_t		Unknown1; //@3
	uint16_t	Width; //@4
	uint16_t	Height; //@6
	uint16_t	YChannelQuantizationScale; //@8
	uint16_t	CbChannelQuantizationScale; //@A
	uint16_t	CrChannelQuantizationScale; //@C
	uint16_t	AChannelQuantizationScale; //@E
	uint32_t	BlockSize0; //@10
	uint32_t	BlockSize1; //@14
	uint32_t	Unknown6; //@18
};
//Last field sits at 0x18 and is 4 bytes wide: any padding would break the raw read.
static_assert(sizeof(SWCTHeader) == 0x1C, "SWCTHeader must be exactly 0x1C bytes");


/*
Bit reader used by the hand-rolled LZSS loop in main.
Input is consumed one 16-bit little-endian word at a time; bits are handed
out starting from the least significant bit of the accumulator.
*/
class CTPWBitReader{
protected:
	const CDynMemoryBuffer&	m_Buffer;
	size_t				m_Offset = 0;			//byte offset of the next 16-bit word to fetch
	size_t				m_BitsRemaining = 0;	//number of valid bits currently held in m_Value
	uint32_t			m_Value = 0;			//bit accumulator, the next bit to return is bit 0
public:
	CTPWBitReader(const CDynMemoryBuffer&buffer) : m_Buffer(buffer){
	}
	/*
	Returns the next `length` bits (length <= 16) as an unsigned value.
	Bytes past the end of the buffer are treated as zero instead of being
	read, so a truncated stream never touches memory outside the buffer.
	*/
	uint16_t GetBits(size_t length){
		if (m_BitsRemaining < length){
			//Fetch exactly two bytes. The previous 32-bit load through a cast
			//pointer was unaligned and read two bytes more than it used.
			const uint8_t* pBytes = m_Buffer.GetBytePtr();
			const size_t size = m_Buffer.GetSize();
			uint32_t value = 0;
			if (m_Offset < size){
				value = pBytes[m_Offset];
			}
			if (m_Offset + 1 < size){
				value |= (uint32_t)pBytes[m_Offset + 1] << 8;
			}
			m_Offset += 2;
			//m_BitsRemaining is below 16 here, so the shifted word fits in 32 bits
			m_Value |= value << m_BitsRemaining;
			m_BitsRemaining += 16;
		}
		const uint16_t returnValue = (uint16_t)(m_Value & ((1u << length) - 1u));
		m_BitsRemaining -= length;
		m_Value >>= length;
		return returnValue;
	}
	//Byte offset of the next word that would be fetched (always a multiple of 2).
	size_t GetOffset() const{
		return m_Offset;
	}
};


/*
Rounds `size` up to the next power of two, with a minimum of 8.
Sizes of 0 return 8; sizes above 0x80000000 saturate at 0x80000000 because
the next power of two is not representable (the old code shifted by 32 in
both cases, which is undefined).
*/
uint32_t GetAlignedSize(uint32_t size){
	const uint32_t maxPowerOfTwo = 0x80000000u;
	uint32_t aligned = 8;
	while (aligned < size && aligned < maxPowerOfTwo){
		aligned <<= 1;
	}
	return aligned;
}

//Filter taps expressed as standalone constants. These four evaluate to the
//same values InitCoefficients stores in H0..H3: (1+s3), (3+s3), (3-s3) and
//(1-s3), each divided by 4*sqrt(2), with s3 = sqrt(3).
//No live code in this file references them.
static const float g_Scale0 = (float)(((sqrt(3.0) + 1.0) / 4.0) / sqrt(2));
static const float g_Scale1 = (float)(((sqrt(3.0) + 3.0) / 4.0) / sqrt(2));
static const float g_Scale2 = (float)(((3.0 - sqrt(3.0)) / 4.0) / sqrt(2));
static const float g_Scale3 = (float)(((1.0 - sqrt(3.0)) / 4.0) / sqrt(2));

//Same four taps without the final division by sqrt(2).
static const float g_Scale0_ = (float)(((sqrt(3.0) + 1.0) / 4.0));
static const float g_Scale1_ = (float)(((sqrt(3.0) + 3.0) / 4.0));
static const float g_Scale2_ = (float)(((3.0 - sqrt(3.0)) / 4.0));
static const float g_Scale3_ = (float)(((1.0 - sqrt(3.0)) / 4.0));

/*
Filter taps used by the D4 transform helpers in this file.
H* / G* are the forward scaling and wavelet taps, IH* / IG* the taps the
inverse transform multiplies with (see the Apply*CoefficientsInv helpers).
*/
struct SD4Coefficients{
	float	H0;
	float	H1;
	float	H2;
	float	H3;

	float	G0;
	float	G1;
	float	G2;
	float	G3;

	float	IH0;
	float	IH1;
	float	IH2;
	float	IH3;

	float	IG0;
	float	IG1;
	float	IG2;
	float	IG3;
};

/*
Fills `coefs` with the D4 taps, every tap multiplied by `scale`.
H0..H3 are (1+s3), (3+s3), (3-s3), (1-s3) over 4*sqrt(2) with s3 = sqrt(3);
the G, IH and IG sets are sign/order permutations of those four values.
*/
static void InitCoefficientsWithScale(SD4Coefficients &coefs, float scale){

	const double s3 = sqrt(3.0);
	const double denom = 4 * sqrt(2.0);

	//Round each tap to float first, then scale in float (keeps results
	//identical to the original implementation).
	coefs.H0 = (float)((1 + s3) / denom) * scale;
	coefs.H1 = (float)((3 + s3) / denom) * scale;
	coefs.H2 = (float)((3 - s3) / denom) * scale;
	coefs.H3 = (float)((1 - s3) / denom) * scale;

	coefs.G0 = coefs.H3;
	coefs.G1 = -coefs.H2;
	coefs.G2 = coefs.H1;
	coefs.G3 = -coefs.H0;

	coefs.IH0 = coefs.H2;
	coefs.IH1 = coefs.G2;
	coefs.IH2 = coefs.H0;
	coefs.IH3 = coefs.G0;

	coefs.IG0 = coefs.H3;
	coefs.IG1 = coefs.G3;
	coefs.IG2 = coefs.H1;
	coefs.IG3 = coefs.G1;
}

/*
Fills `coefs` with the unscaled D4 taps.
Multiplying a float by 1.0f is exact, so delegating yields bit-identical
values to the previous copy-pasted implementation.
*/
static void InitCoefficients(SD4Coefficients &coefs){
	InitCoefficientsWithScale(coefs, 1.0f);
}


/*
Reads one variable-length coefficient from `pData` and advances the pointer.
A single signed byte is the value itself, except -128 which is an escape
marker: the value is then the signed 16-bit little-endian integer stored in
the next two bytes.
The caller must guarantee enough bytes are available (1, or 3 after an escape).
*/
static inline float FetchFloat(const int8_t* &pData){
	int32_t value = *(pData++);
	//INT8_MIN rather than CHAR_MIN: CHAR_MIN is 0 where plain char is unsigned
	if (value == INT8_MIN){
		//Assemble the bytes explicitly instead of casting to int16_t*:
		//the source is const and not guaranteed to be 2-byte aligned.
		const uint16_t lo = (uint8_t)pData[0];
		const uint16_t hi = (uint8_t)pData[1];
		value = (int16_t)(lo | (hi << 8));
		pData += 2;
	}
	return (float)value;
}

#include "../Shared/VectorMath.h"


//Weighted sum of the previous pair and the current pair using the IH taps:
//prevValueA*IH0 + prevValueB*IH1 + valueA*IH2 + valueB*IH3 (summed left to right).
inline float ApplyScalingCoefficientsInv(float valueA, float valueB, float prevValueA, float prevValueB,
	const SD4Coefficients &coefs){
	float sum = prevValueA * coefs.IH0;
	sum = sum + prevValueB * coefs.IH1;
	sum = sum + valueA * coefs.IH2;
	sum = sum + valueB * coefs.IH3;
	return sum;
}
//Weighted sum of the previous pair and the current pair using the IG taps:
//prevValueA*IG0 + prevValueB*IG1 + valueA*IG2 + valueB*IG3 (summed left to right).
inline float ApplyWaveCoefficientsInv(float valueA, float valueB, float prevValueA, float prevValueB,
	const SD4Coefficients &coefs){
	float sum = prevValueA * coefs.IG0;
	sum = sum + prevValueB * coefs.IG1;
	sum = sum + valueA * coefs.IG2;
	sum = sum + valueB * coefs.IG3;
	return sum;
}

//Same computation as ApplyScalingCoefficientsInv with parameters named after
//their role: the previous (smooth, coef) pair meets IH0/IH1, the current pair IH2/IH3.
inline float ApplyScalingCoefficientsInv2(float smoothVal, float coef,
	float previousSmoothVal, float previousCoef, const SD4Coefficients &coefs){
	const float previousPart = previousSmoothVal * coefs.IH0 + previousCoef * coefs.IH1;
	const float withSmooth = previousPart + smoothVal * coefs.IH2;
	return withSmooth + coef * coefs.IH3;
}
//Same computation as ApplyWaveCoefficientsInv with parameters named after
//their role: the previous (smooth, coef) pair meets IG0/IG1, the current pair IG2/IG3.
inline float ApplyWaveCoefficientsInv2(float smoothVal, float coef,
	float previousSmoothVal, float previousCoef, const SD4Coefficients &coefs){
	const float previousPart = previousSmoothVal * coefs.IG0 + previousCoef * coefs.IG1;
	const float withSmooth = previousPart + smoothVal * coefs.IG2;
	return withSmooth + coef * coefs.IG3;
}

//IH-weighted sum of two consecutive (smooth, coef) pairs, in argument order:
//smooth0*IH0 + coef0*IH1 + smooth1*IH2 + coef1*IH3.
inline float ApplyScalingCoefficientsInv3(float smooth0, float coef0,
	float smooth1, float coef1, const SD4Coefficients &coefs){
	float result = smooth0 * coefs.IH0;
	result = result + coef0 * coefs.IH1;
	result = result + smooth1 * coefs.IH2;
	result = result + coef1 * coefs.IH3;
	return result;
}
//IG-weighted sum of two consecutive (smooth, coef) pairs, in argument order:
//smooth0*IG0 + coef0*IG1 + smooth1*IG2 + coef1*IG3.
inline float ApplyWaveCoefficientsInv3(float smooth0, float coef0,
	float smooth1, float coef1, const SD4Coefficients &coefs){
	float result = smooth0 * coefs.IG0;
	result = result + coef0 * coefs.IG1;
	result = result + smooth1 * coefs.IG2;
	result = result + coef1 * coefs.IG3;
	return result;
}


/*
Row-wise inverse transform driven by a value source.
fetchFn is called with no arguments and needs to return float; it is called
`size` times per row (two values per pair) for `size` rows.
Each pair is combined with the pair fetched before it and written to the
matching two output slots of the row; the first pair of a row is combined
with the last pair of the same row (wrap-around) and written last.
*/

template<typename T>
void D4InverseTransform(T &fetchFn, size_t size, float* pOutput, const SD4Coefficients &coefs){
	const size_t pairsPerRow = size / 2;
	for (size_t row = 0; row < size; ++row){
		float* const pRow = pOutput + row * size;

		//First pair of the row; it is only emitted once the last pair is known
		const float firstA = fetchFn();
		const float firstB = fetchFn();
		float lastA = firstA;
		float lastB = firstB;

		for (size_t pair = 1; pair < pairsPerRow; ++pair){
			const float currentA = fetchFn();
			const float currentB = fetchFn();
			float* const pPairOut = pRow + pair * 2;
			pPairOut[0] = ApplyScalingCoefficientsInv(currentA, currentB, lastA, lastB, coefs);
			pPairOut[1] = ApplyWaveCoefficientsInv(currentA, currentB, lastA, lastB, coefs);
			lastA = currentA;
			lastB = currentB;
		}

		//Wrap-around: first pair combined with the last pair fetched for this row
		pRow[0] = ApplyScalingCoefficientsInv(firstA, firstB, lastA, lastB, coefs);
		pRow[1] = ApplyWaveCoefficientsInv(firstA, firstB, lastA, lastB, coefs);
	}
}

/*
Experimental inverse pass reading four taps from fixed offsets of pSrc
(offsetA..offsetD) and writing interleaved results to pDest.
Nothing in the visible part of this file calls it.
NOTE(review): this looks unfinished - the outer loop never uses `i` and
neither outputIndexStart nor inputIndexStart is advanced, so every outer
iteration recomputes and rewrites the same pDest[size*2 .. size*4 - 1] range.
NOTE(review): `halfSize - 1` and `size - 1` are unsigned, so size < 2 wraps
around to a huge loop bound.
*/
void D4InverseTransform2(const float* pSrc, float* pDest, size_t size, const SD4Coefficients &coefs){

	size_t halfSize = size / 2;
	size_t i, j;

	//Element offsets of the four taps relative to inputIndexStart + j
	size_t offsetA = 0;
	size_t offsetB = size;
	size_t offsetC = size * halfSize;
	size_t offsetD = size + size * halfSize;

	size_t outputIndexStart = size * 2;
	size_t inputIndexStart = 0;

	for (i = 0; i < halfSize - 1; ++i){

		for (j = 0; j < size - 1; ++j){
			pDest[outputIndexStart + j * 2] = ApplyScalingCoefficientsInv(
				pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
				pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
				coefs
			);
			pDest[outputIndexStart + j * 2 + 1] = ApplyWaveCoefficientsInv(
				pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
				pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
				coefs
			);
		}

		//j == size - 1 here; these two statements repeat the loop body for the
		//final column with identical operands.
		pDest[outputIndexStart + j * 2] = ApplyScalingCoefficientsInv(
			pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
			pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
			coefs
		);
		pDest[outputIndexStart + j * 2 + 1] = ApplyWaveCoefficientsInv(
			pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
			pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
			coefs
		);
	}
}

//Selects which half of a (scale, wavelet) pair an index lookup refers to.
enum class eD4Component { Scale, Wavelet };
/*
Generic inverse pass over `size` (scale, wavelet) pairs.

T  = size_t (size_t index, eD4Component component) - source index of a pair member
T2 = size_t (size_t index, eD4Component component) - destination index of a pair member

For every pair i the previous pair (pair size-1 when i == 0, i.e. wrap-around)
and pair i itself are read from pSrc, and the two results are stored at the
destinations reported for pair i. pSrc and pDest must not overlap: inputs of
later pairs are read after earlier outputs have been written.

The callables are taken by forwarding reference so that both named functors
and lambda temporaries are accepted by standard-conforming compilers (a
non-const lvalue reference only binds temporaries through an MSVC extension).
*/
template<typename T, typename T2>
inline void D4InverseTransform3(T &&indexLookup, T2 &&outputIndexLookup, const float* pSrc, float* pDest, size_t size,
	const SD4Coefficients &coefs){
	for (size_t i = 0; i < size; ++i){
		const size_t previous = (i == 0) ? size - 1 : i - 1;

		//Indices stay size_t: the lookups return size_t and large buffers must not be truncated
		const size_t s0i = indexLookup(previous, eD4Component::Scale);
		const size_t w0i = indexLookup(previous, eD4Component::Wavelet);
		const size_t s1i = indexLookup(i, eD4Component::Scale);
		const size_t w1i = indexLookup(i, eD4Component::Wavelet);

		const float s0 = pSrc[s0i];
		const float w0 = pSrc[w0i];
		const float s1 = pSrc[s1i];
		const float w1 = pSrc[w1i];

		const size_t s0d = outputIndexLookup(i, eD4Component::Scale);
		const size_t w0d = outputIndexLookup(i, eD4Component::Wavelet);

		pDest[s0d] = ApplyScalingCoefficientsInv3(s0, w0, s1, w1, coefs);
		pDest[w0d] = ApplyWaveCoefficientsInv3(s0, w0, s1, w1, coefs);
	}
}

/*
Debug experiment: logs, for f = 0..255, the result of a table-driven
"luma + 1.772 * chroma, clamped to 0..255" lookup where the integer inputs
are taken from the low bits of a float after adding 12582912.0f (1.5 * 2^23).
With that bias the low mantissa bits of the float hold f as an integer.
*/
void FloatTest(){
	int32_t table_a[0x100];
	uint8_t table_b[0x200];
	uint32_t i;

	//table_a[k] = trunc((k - 128) * 1.772)
	for (i = 0; i < 0x100; ++i){
		table_a[i] = (int32_t)((double)((int32_t)i - 0x80) * 1.772);
	}

	//Clamp table with a bias of 128: table_b[k] = clamp(k - 128, 0, 255).
	//All 0x200 entries are filled; the loop used to stop at 256, leaving the
	//upper half uninitialised although indices up to 480 are read below.
	for (i = 0; i < 0x200; ++i){
		int32_t n = (int32_t)i - 128;
		if (n < 0){
			n = 0;
		} else if (n > 255){
			n = 255;
		}
		table_b[i] = (uint8_t)n;
	}

	const float sub = -1.2582912e7f;
	float f;
	for (f = 0.0f; f < 256.0f; f += 1.0f){

		float t = f - sub;
		//Raw bit pattern of the biased float (kept as in the experiment)
		uint32_t bits = *(uint32_t*)&t;

		int16_t o16 = (int16_t)bits;		//low 16 bits: f as an integer
		int32_t o8 = ((int8_t)bits);		//low 8 bits reinterpreted as signed, -128..127
		int32_t o32 = table_a[128 + o8];
		uint8_t val = table_b[o16 + o32 + 128];
		LogFormatedA("%G -> %u\r\n", f, val);
	}
}
#include "../Shared/Array.h"

//Scratch buffers reused across DecodeChannel calls; DecodeChannel resizes
//both to size*size floats on every call.
struct SImageDecodeState{

	//Raw coefficients converted to float (DecodeChannel step 1)
	TDynArray<float>	DequantizationBuffer;
	//Output of the row pass, input of the column pass (DecodeChannel steps 2 and 3)
	TDynArray<float>	RowDecodeBuffer;
};

/*
Decodes one size x size channel from the coefficient stream.

state               - scratch buffers, resized here
size                - edge length of the (square) channel
pSrc                - read cursor into the coefficient stream, advanced past
                      everything consumed (also on failure)
pSrcEnd             - one past the last readable byte
outputBuffer        - receives size*size floats, row-major
dequantizationScale - factor folded into the taps of the row pass

Returns false when the stream ends before all coefficients were read; in
that case outputBuffer has been resized but not filled.
*/
bool DecodeChannel(SImageDecodeState &state, size_t size,
	const int8_t*& pSrc, const int8_t* pSrcEnd, TDynArray<float> &outputBuffer,
	float dequantizationScale){

	const size_t halfSize = size / 2;
	//Two values per pair, size * (size / 2) pairs
	const size_t valueCount = size * halfSize * 2;

	state.DequantizationBuffer.Resize(size * size);
	state.RowDecodeBuffer.Resize(size * size);
	outputBuffer.Resize(size * size);

	float* pRowDecodeBuffer = state.RowDecodeBuffer.begin();
	float* pDequantizationBuffer = state.DequantizationBuffer.begin();
	float* pOutputBuffer = outputBuffer.begin();

	//Step 1: dequantize
	//Each value is one signed byte, or the escape byte -128 followed by a
	//signed 16-bit little-endian value.
	for (size_t i = 0; i < valueCount; ++i){
		if (pSrc >= pSrcEnd){
			return false;
		}
		int32_t val = *(pSrc++);
		//INT8_MIN rather than CHAR_MIN: CHAR_MIN is 0 where plain char is unsigned
		if (val == INT8_MIN){
			if (pSrcEnd - pSrc < 2){
				return false;
			}
			//Byte-wise assembly: the stream is const and not 2-byte aligned
			const uint16_t lo = (uint8_t)pSrc[0];
			const uint16_t hi = (uint8_t)pSrc[1];
			val = (int16_t)(lo | (hi << 8));
			pSrc += 2;
		}
		pDequantizationBuffer[i] = (float)val;
	}

	//Step 2: decode rows
	//Within a row the pairs are interleaved: pair k is (2k, 2k + 1) in both
	//the source and the destination.
	SD4Coefficients coefs;
	InitCoefficientsWithScale(coefs, dequantizationScale);

	auto rowIndex = [](size_t index, eD4Component component) -> size_t{
		return index * 2 + (component == eD4Component::Wavelet ? 1 : 0);
	};
	for (size_t row = 0; row < size; ++row){
		D4InverseTransform3(rowIndex, rowIndex,
			pDequantizationBuffer + row * size,
			pRowDecodeBuffer + row * size, halfSize, coefs);
	}

	//Step 3: decode columns
	//Scale values live in the upper half of the rows, wavelet values in the
	//lower half; the results are written to interleaved rows (2k, 2k + 1).
	InitCoefficients(coefs);
	const size_t waveletOffset = size * halfSize;
	for (size_t col = 0; col < size; ++col){
		auto sourceIndex = [col, size, waveletOffset](size_t index, eD4Component component) -> size_t{
			return index * size + col + (component == eD4Component::Wavelet ? waveletOffset : 0);
		};
		auto destIndex = [col, size](size_t index, eD4Component component) -> size_t{
			return index * 2 * size + col + (component == eD4Component::Wavelet ? size : 0);
		};
		D4InverseTransform3(sourceIndex, destIndex, pRowDecodeBuffer, pOutputBuffer, halfSize, coefs);
	}
	return true;
}


/*
Decompresses an LZSS bit stream into outputBuffer.

Stream layout, as read here: a 1-bit flag, then either
  flag 0: an 8-bit literal byte, or
  flag 1: a 12-bit back-reference offset (0 terminates the stream) followed
          by a 7-bit length, stored as length - 1.

Returns false when the input ends prematurely, a back-reference points
before the start of the output, or the output would exceed maxSize bytes.
On success outputBuffer is shrunk to the number of bytes produced; on
failure it is left resized to maxSize with partial contents.
*/
bool WCTDecompressLZSS(const CDynMemoryBuffer &inputBuffer, CDynMemoryBuffer &outputBuffer){

	size_t bitOffset = 0;
	bool errored = false;
	auto ReadBits = [&inputBuffer, &bitOffset, &errored](size_t count) -> uint16_t {

		if (bitOffset + count > inputBuffer.GetSize() * 8){
			errored = true;
			return 0;
		}
		uint16_t value = GetBitsRTL<uint16_t>(inputBuffer.GetBytePtr(), bitOffset, count);
		bitOffset += count;
		return value;
	};

	//Worst case for a 256x256 Y plane plus two 128x128 chroma planes at 3 bytes per coefficient
	constexpr size_t maxSize = 256 * 256 * 3 + 128 * 128 * 3 + 128 * 128 * 3;

	outputBuffer.Resize(maxSize);

	uint8_t* const pOutput = outputBuffer.GetBytePtr();
	size_t written = 0;

	for (;;){
		const uint16_t isDelta = ReadBits(1);
		if (errored){
			return false;
		}

		//One byte literal
		if (isDelta == 0){
			const uint8_t literalValue = (uint8_t)ReadBits(8);
			if (errored){
				return false;
			}
			if (written >= maxSize){
				return false;
			}
			pOutput[written++] = literalValue;
			continue;
		}

		const size_t offset = ReadBits(12);
		if (errored){
			return false;
		}
		if (offset == 0){
			break;
		}

		const size_t length = (size_t)ReadBits(7) + 1;
		if (errored){
			return false;
		}

		//Reject references before the start of the output and copies that
		//would run past the end of the output buffer.
		if (offset > written || length > maxSize - written){
			return false;
		}

		//Byte-by-byte on purpose: source and destination may overlap
		//(offset < length repeats the most recent bytes).
		for (size_t i = 0; i < length; ++i, ++written){
			pOutput[written] = pOutput[written - offset];
		}
	}
	outputBuffer.Resize(written);
	return true;
}

//Returns the larger of the two arguments; b is returned when a is not greater than b.
template<typename T> T tmax(const T a, const T b){
	if (a > b){
		return a;
	}
	return b;
}
//Returns the smaller of the two arguments; b is returned when a is not less than b.
template<typename T> T tmin(const T a, const T b){
	if (a < b){
		return a;
	}
	return b;
}


//Dequantization factor for the Y channel: 1 + n / 2.
static inline float ComputeDequantizationScaleY(uint16_t n){
	const float halfSteps = (float)n * 0.5f;
	return 1.0f + halfSteps;
}

//Dequantization factor for the Cb and Cr channels: 1 + n / 4.
static inline float ComputeDequantizationScaleCbCr(uint16_t n){
	const float quarterSteps = (float)n * 0.25f;
	return 1.0f + quarterSteps;
}


//Dequantization factor for the alpha channel: n + 1.
static inline float ComputeDequantizationScaleA(uint16_t n){
	const float steps = (float)n;
	return steps + 1.0f;
}


bool DecodeImage(const CDynMemoryBuffer &inputBuffer, CDynMemoryBuffer &outputBuffer){

	SWCTHeader header;
	CDynMemoryReader r(&inputBuffer);
	if (!r.Get(header)){
		return false;
	}
	if (header.CompressionType != eWCTType::lzss &&
		header.CompressionType != eWCTType::zlib){
	}

	CDynMemoryBuffer blockBuffer, block0, block1;

	if (header.BlockSize0 == 0){
		return false;
	}
	blockBuffer.Resize(header.BlockSize0);

	if (!r.GetRaw(blockBuffer.GetBytePtr(), header.BlockSize0)){
		return false;
	}
	if (header.CompressionType == eWCTType::lzss){
		if (!WCTDecompressLZSS(blockBuffer, block0)){
			return false;
		}
	} else {
		return false;
	}


	if (header.BlockSize1 != 0 && header.BitCount == 32){
		blockBuffer.Resize(header.BlockSize1);

		if (!r.GetRaw(blockBuffer.GetBytePtr(), header.BlockSize1)){
			return false;
		}
		if (header.CompressionType == eWCTType::lzss){
			if (!WCTDecompressLZSS(blockBuffer, block1)){
				return false;
			}
		} else {
			return false;
		}
	}


	size_t size = tmax<size_t>(GetAlignedSize(tmax<size_t>(header.Height, header.Width)), 8u);

	const int8_t* pDecodeSource = (const int8_t*)block0.GetBytePtr();
	const int8_t* pEnd = pDecodeSource + block0.GetSize();

	SImageDecodeState state;
	TDynArray<float> outputY, outputCb, outputCr, outputA;

	if (!DecodeChannel(state, size, pDecodeSource, pEnd,
		outputY, ComputeDequantizationScaleY(header.YChannelQuantizationScale))){
		return false;
	}
	if (!DecodeChannel(state, size / 2, pDecodeSource, pEnd,
		outputCb, ComputeDequantizationScaleCbCr(header.CbChannelQuantizationScale))){
		return false;
	}
	if (!DecodeChannel(state, size / 2, pDecodeSource, pEnd,
		outputCr, ComputeDequantizationScaleCbCr(header.CrChannelQuantizationScale))){
		return false;
	}

	TDynArray<float> output;


	output.Resize(header.Width * header.Height * 3);

	uint32_t x, y;
	for (y = 0; y < header.Height; ++y){
		for (x = 0; x < header.Width; ++x){
			float cy = outputY[y * size + x];
			float cb = outputCb[((y / 2) * (size / 2) + (x / 2))];
			float cr = outputCr[((y / 2) * (size / 2) + (x / 2))];
			const float r = cy + 1.402f * (cr);
			const float g = cy - 0.344136f * (cb) - 0.714136f * (cr);
			const float b = cy + 1.772f * (cb);

			output[((y * header.Width + x) * 3)] = r;
			output[((y * header.Width + x) * 3) + 1] = g;
			output[((y * header.Width + x) * 3) + 2] = b;
		}
	}
	outputBuffer.Resize(sizeof(float) * header.Width * header.Height * 3);
	memcpy(outputBuffer.GetBytePtr(), output.begin(), sizeof(float)*output.Size());

	return true;
}

/*
Scratch driver for the decoder.
1. Runs DecodeImage on the test file and writes the result.
2. Repeats the decode by hand (header, LZSS via CTPWBitReader with logging,
   three DecodeChannel calls, colour conversion) and writes the result to the
   same output file, replacing the file written in step 1.
Returns 0 on success and 1 when the manual decode fails.
*/
int main(int argc, const char*const*argv){

	//FloatTest();

	CDynMemoryBuffer textureFile, outputFile;
	//NOTE(review): both loads target the same buffer; presumably the second
	//one replaces the first - confirm against LoadFileToBuffer.
	LoadFileToBuffer("I:\\red.wct", textureFile);
	LoadFileToBuffer("I:\\TP\\Test.wct", textureFile);

	if (DecodeImage(textureFile, outputFile)){
		WriteBufferToFile("I:\\TP\\ImageData.bin", outputFile);
	} else {
		LogFormatedA("DecodeImage failed\r\n");
	}


	SWCTHeader header;
	CDynMemoryBuffer block0, block1;
	{
		CDynMemoryReader r(&textureFile);
		if (!r.Get(header)){
			LogFormatedA("Failed to read header\r\n");
			return 1;
		}
		block0.Resize(header.BlockSize0);
		block1.Resize(header.BlockSize1);

		if (!r.GetRaw(block0.GetBytePtr(), header.BlockSize0)){
			LogFormatedA("Failed to read block 0\r\n");
			return 1;
		}
		//The second block is read but not used below
		r.GetRaw(block1.GetBytePtr(), header.BlockSize1);
	}

	if (header.Width == 0 || header.Height == 0){
		LogFormatedA("Image has a zero dimension\r\n");
		return 1;
	}

	const uint32_t alignedWidth = GetAlignedSize(header.Width);
	const uint32_t alignedHeight = GetAlignedSize(header.Height);
	const uint32_t maxSize = alignedWidth < alignedHeight ? alignedHeight : alignedWidth;

	const float float8 = 1.0f - ((float)header.YChannelQuantizationScale * -0.5f);
	const float floatA = 1.0f - ((float)header.CbChannelQuantizationScale * -0.25f);
	const float floatC = 1.0f - ((float)header.CrChannelQuantizationScale * -0.25f);


	CTPWBitReader br(block0);

	static uint8_t decodeBuffer[0x60000];
	const size_t decodeCapacity = sizeof(decodeBuffer);
	size_t written = 0;

	for (;;){
		//The reader has stepped past the end of the block: the stream is truncated
		if (br.GetOffset() > block0.GetSize() + 2){
			LogFormatedA("Unexpected end of compressed data\r\n");
			return 1;
		}

		uint16_t isDelta = br.GetBits(1);
		//One byte literal
		if (isDelta == 0){
			uint8_t literalValue = (uint8_t)br.GetBits(8);
			LogFormatedA("Literal %.2X\r\n", literalValue);
			if (written >= decodeCapacity){
				LogFormatedA("Decoded data exceeds the decode buffer\r\n");
				return 1;
			}
			decodeBuffer[written++] = literalValue;
		} else {
			size_t offset = br.GetBits(12);

			//Offset 0 terminates the stream; the length field is only read for
			//real back-references (as in WCTDecompressLZSS), so no bits are
			//fetched past the end marker.
			if (offset == 0){
				LogFormatedA("End of stream\r\n");
				break;
			}
			size_t length = (size_t)br.GetBits(7) + 1;

			LogFormatedA("Delta -%zu[%zu]\r\n", offset, length);

			if (offset > written || length > decodeCapacity - written){
				LogFormatedA("Invalid back-reference\r\n");
				return 1;
			}

			//Byte-by-byte on purpose: source and destination may overlap
			for (size_t i = 0; i < length; ++i, ++written){
				decodeBuffer[written] = decodeBuffer[written - offset];
			}
		}
	}
	const size_t len2 = written;


	const int8_t* pDecodeSource = (const int8_t*)decodeBuffer;
	const int8_t* pDecodeEnd = (const int8_t*)decodeBuffer + len2;

	SImageDecodeState state;
	TDynArray<float> outputY, outputCb, outputCr;

	//Y at full resolution, then Cb and Cr at half resolution
	const bool decoded =
		DecodeChannel(state, maxSize, pDecodeSource, pDecodeEnd, outputY, float8) &&
		DecodeChannel(state, maxSize/2, pDecodeSource, pDecodeEnd, outputCb, floatA) &&
		DecodeChannel(state, maxSize/2, pDecodeSource, pDecodeEnd, outputCr, floatC);
	if (!decoded){
		LogFormatedA("Channel decode failed\r\n");
		return 1;
	}


	const size_t width = header.Width;
	const size_t height = header.Height;

	TDynArray<float> output;
	output.Resize(width * height * 3);

	for (size_t y = 0; y < height; ++y){
		for (size_t x = 0; x < width; ++x){
			const size_t chromaIndex = (y / 2) * (maxSize / 2) + (x / 2);
			const float cy = outputY[y * maxSize + x];
			const float cb = outputCb[chromaIndex];
			const float cr = outputCr[chromaIndex];

			//NOTE(review): chroma is biased by 128 here but not in DecodeImage;
			//only one of the two can match the format - confirm which.
			const float red = cy + 1.402f * (cr - 128.0f);
			const float green = cy - 0.344136f * (cb - 128.0f) - 0.714136f * (cr - 128.0f);
			const float blue = cy + 1.772f * (cb - 128.0f);

			const size_t pixel = (y * width + x) * 3;
			output[pixel] = red;
			output[pixel + 1] = green;
			output[pixel + 2] = blue;
		}
	}

	{
		FILE* f = nullptr;
		if (fopen_s(&f, "I:\\TP\\ImageData.bin", "wb") != 0 || f == nullptr){
			LogFormatedA("Failed to open output file\r\n");
			return 1;
		}
		fwrite(output.begin(), sizeof(float), output.Size(), f);
		fclose(f);
	}

	return 0;
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <conio.h>
	#include <stdint.h>

	#include "../Shared/Buffer.h"
	#include "../Shared/FileHelpers.h"
	#include "../Shared/BitStreams.h"

	enum class eWCTType : uint8_t{
	zlib = 0x12,
	lzss = 0x13,
	};
	struct SWCTHeader{
	eWCTType CompressionType; //0x12 or 0x13 @0
	uint8_t Version; //?, @1
	uint8_t BitCount; //@2
	uint8_t Unknown1; //@3
	uint16_t Width; //@4
	uint16_t Height; //@6
	uint16_t YChannelQuantizationScale; //@8
	uint16_t CbChannelQuantizationScale; //@A
	uint16_t CrChannelQuantizationScale; //@C
	uint16_t AChannelQuantizationScale; //@E
	uint32_t BlockSize0; //@10
	uint32_t BlockSize1; //@14
	uint32_t Unknown6; //@18
	};



	class CTPWBitReader{
	protected:
	const CDynMemoryBuffer& m_Buffer;
	size_t m_Offset = 0;
	size_t m_BitsRemaining = 0;
	uint32_t m_Value = 0;
	public:
	CTPWBitReader(const CDynMemoryBuffer&buffer) : m_Buffer(buffer){
	}
	uint16_t GetBits(size_t length){



	/*const uint16_t value = GetBitsRTL<uint16_t>(m_Buffer.GetBytePtr(), m_Offset, length);
	m_Offset += length;
	return value;*/

	size_t bitsRemaining = m_BitsRemaining;
	if (bitsRemaining < length){
	uint32_t value = *(const uint32_t*)(m_Buffer.GetBytePtr() + m_Offset) & 0xFFFF;
	m_Offset += 2;
	//LogFormatedA("Fetch %.4X\r\n", value);
	m_Value = m_Value | (value << m_BitsRemaining);
	m_BitsRemaining += 16;
	}
	const uint16_t returnValue = m_Value & ((1 << length) - 1);
	m_BitsRemaining -= length;
	m_Value >>= length;
	return returnValue;
	}
	size_t GetOffset() const{
	return m_Offset;
	}
	};


	uint32_t GetAlignedSize(uint32_t size){

	uint32_t value = (size - 1);
	uint32_t n = 1;
	for (; (value >>= 1) != 0;){
	++n;
	}
	if (n < 3){
	n = 3;
	}
	return 1 << n;
	}

	static const float g_Scale0 = (float)(((sqrt(3.0) + 1.0) / 4.0) / sqrt(2));
	static const float g_Scale1 = (float)(((sqrt(3.0) + 3.0) / 4.0) / sqrt(2));
	static const float g_Scale2 = (float)(((3.0 - sqrt(3.0)) / 4.0) / sqrt(2));
	static const float g_Scale3 = (float)(((1.0 - sqrt(3.0)) / 4.0) / sqrt(2));

	static const float g_Scale0_ = (float)(((sqrt(3.0) + 1.0) / 4.0));
	static const float g_Scale1_ = (float)(((sqrt(3.0) + 3.0) / 4.0));
	static const float g_Scale2_ = (float)(((3.0 - sqrt(3.0)) / 4.0));
	static const float g_Scale3_ = (float)(((1.0 - sqrt(3.0)) / 4.0));

	struct SD4Coefficients{
	float H0;
	float H1;
	float H2;
	float H3;

	float G0;
	float G1;
	float G2;
	float G3;

	float IH0;
	float IH1;
	float IH2;
	float IH3;

	float IG0;
	float IG1;
	float IG2;
	float IG3;
	};
	static void InitCoefficients(SD4Coefficients &coefs){

	const double s3 = sqrt(3.0);
	const double denom = 4 * sqrt(2.0);

	coefs.H0 = (float)((1 + s3) / denom);
	coefs.H1 = (float)((3 + s3) / denom);
	coefs.H2 = (float)((3 - s3) / denom);
	coefs.H3 = (float)((1 - s3) / denom);

	coefs.G0 = coefs.H3;
	coefs.G1 = -coefs.H2;
	coefs.G2 = coefs.H1;
	coefs.G3 = -coefs.H0;

	coefs.IH0 = coefs.H2;
	coefs.IH1 = coefs.G2;
	coefs.IH2 = coefs.H0;
	coefs.IH3 = coefs.G0;

	coefs.IG0 = coefs.H3;
	coefs.IG1 = coefs.G3;
	coefs.IG2 = coefs.H1;
	coefs.IG3 = coefs.G1;
	}
	static void InitCoefficientsWithScale(SD4Coefficients &coefs, float scale){

	const double s3 = sqrt(3.0);
	const double denom = 4 * sqrt(2.0);

	//scale = 1.0f;

	coefs.H0 = (float)((1 + s3) / denom) * scale;
	coefs.H1 = (float)((3 + s3) / denom) * scale;
	coefs.H2 = (float)((3 - s3) / denom) * scale;
	coefs.H3 = (float)((1 - s3) / denom) * scale;

	coefs.G0 = coefs.H3;
	coefs.G1 = -coefs.H2;
	coefs.G2 = coefs.H1;
	coefs.G3 = -coefs.H0;

	coefs.IH0 = coefs.H2;
	coefs.IH1 = coefs.G2;
	coefs.IH2 = coefs.H0;
	coefs.IH3 = coefs.G0;

	coefs.IG0 = coefs.H3;
	coefs.IG1 = coefs.G3;
	coefs.IG2 = coefs.H1;
	coefs.IG3 = coefs.G1;
	}


	static inline float FetchFloat(const int8_t* &pData){
	int32_t value = *(pData++);
	if (value == CHAR_MIN){
	value = *(int16_t*)(pData);
	pData += 2;
	}
	return (float)value;
	}

	#include "../Shared/VectorMath.h"


	inline float ApplyScalingCoefficientsInv(float valueA, float valueB, float prevValueA, float prevValueB,
	const SD4Coefficients &coefs){
	return prevValueA * coefs.IH0 + prevValueB * coefs.IH1 + valueA * coefs.IH2 + valueB * coefs.IH3;
	}
	inline float ApplyWaveCoefficientsInv(float valueA, float valueB, float prevValueA, float prevValueB,
	const SD4Coefficients &coefs){
	return prevValueA * coefs.IG0 + prevValueB * coefs.IG1 + valueA * coefs.IG2 + valueB * coefs.IG3;
	}

	inline float ApplyScalingCoefficientsInv2(float smoothVal, float coef,
	float previousSmoothVal, float previousCoef, const SD4Coefficients &coefs){
	return previousSmoothVal * coefs.IH0 + previousCoef * coefs.IH1 +
	smoothVal * coefs.IH2 + coef * coefs.IH3;
	}
	inline float ApplyWaveCoefficientsInv2(float smoothVal, float coef,
	float previousSmoothVal, float previousCoef, const SD4Coefficients &coefs){
	return previousSmoothVal * coefs.IG0 + previousCoef * coefs.IG1 +
	smoothVal * coefs.IG2 + coef * coefs.IG3;
	}

	inline float ApplyScalingCoefficientsInv3(float smooth0, float coef0,
	float smooth1, float coef1, const SD4Coefficients &coefs){
	return smooth0 * coefs.IH0 + coef0 * coefs.IH1 +
	smooth1 * coefs.IH2 + coef1 * coefs.IH3;
	}
	inline float ApplyWaveCoefficientsInv3(float smooth0, float coef0,
	float smooth1, float coef1, const SD4Coefficients &coefs){
	return smooth0 * coefs.IG0 + coef0 * coefs.IG1 +
	smooth1 * coefs.IG2 + coef1 * coefs.IG3;
	}



	/*
	fetchFn needs to return float
	*/

	template<typename T>
	void D4InverseTransform(T &fetchFn, size_t size, float* pOutput, const SD4Coefficients &coefs){
	size_t i, j, outIndex = 0;
	size_t halfSize = size / 2;
	for (i = 0; i < size; ++i){
	const float valueA = fetchFn();
	const float valueB = fetchFn();
	float previousA = valueA;
	float previousB = valueB;


	for (j = 1; j < halfSize; ++j){
	const float valueA = fetchFn();
	const float valueB = fetchFn();
	pOutput[outIndex + j * 2] = ApplyScalingCoefficientsInv(valueA, valueB, previousA, previousB,
	coefs);
	pOutput[outIndex + j * 2 + 1] = ApplyWaveCoefficientsInv(valueA, valueB, previousA, previousB,
	coefs);


	previousA = valueA;
	previousB = valueB;
	}

	pOutput[outIndex+0] = ApplyScalingCoefficientsInv(valueA, valueB, previousA, previousB,
	coefs);
	pOutput[outIndex + 1] = ApplyWaveCoefficientsInv(valueA, valueB, previousA, previousB,
	coefs);

	outIndex += size;
	}
	}

	void D4InverseTransform2(const float* pSrc, float* pDest, size_t size, const SD4Coefficients &coefs){

	size_t halfSize = size / 2;
	size_t i, j;

	size_t offsetA = 0;
	size_t offsetB = size;
	size_t offsetC = size * halfSize;
	size_t offsetD = size + size * halfSize;

	size_t outputIndexStart = size * 2;
	size_t inputIndexStart = 0;

	for (i = 0; i < halfSize - 1; ++i){

	for (j = 0; j < size - 1; ++j){
	pDest[outputIndexStart + j * 2] = ApplyScalingCoefficientsInv(
	pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
	pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
	coefs
	);
	pDest[outputIndexStart + j * 2 + 1] = ApplyWaveCoefficientsInv(
	pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
	pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
	coefs
	);
	}

	pDest[outputIndexStart + j * 2] = ApplyScalingCoefficientsInv(
	pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
	pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
	coefs
	);
	pDest[outputIndexStart + j * 2 + 1] = ApplyWaveCoefficientsInv(
	pSrc[inputIndexStart + j + offsetB], pSrc[inputIndexStart + j + offsetD],
	pSrc[inputIndexStart + j + offsetA], pSrc[inputIndexStart + j + offsetC],
	coefs
	);
	}
	}

	enum class eD4Component {
	Scale,
	Wavelet,
	};
	/*
	T = size_t (size_t index, eD4Component component)
	T2 = size_T (size_t index, eD4Component component)
	*/
	template<typename T, typename T2>
	inline void D4InverseTransform3(T &indexLookup, T2 &outputIndexLookup, const float* pSrc, float* pDest, size_t size,
	const SD4Coefficients &coefs){
	size_t i;
	for (i = 0; i < size; ++i){
	float s0, w0, s1, w1;
	uint32_t s0i, w0i, s1i, w1i;
	if (i == 0){
	s0i = indexLookup(size - 1, eD4Component::Scale);
	w0i = indexLookup(size - 1, eD4Component::Wavelet);
	} else {
	s0i = indexLookup(i-1, eD4Component::Scale);
	w0i = indexLookup(i-1, eD4Component::Wavelet);
	}
	s1i = indexLookup(i, eD4Component::Scale);
	w1i = indexLookup(i, eD4Component::Wavelet);

	s0 = pSrc[s0i];
	w0 = pSrc[w0i];
	s1 = pSrc[s1i];
	w1 = pSrc[w1i];


	uint32_t s0d = outputIndexLookup(i, eD4Component::Scale);
	uint32_t w0d = outputIndexLookup(i, eD4Component::Wavelet);

	pDest[s0d] = ApplyScalingCoefficientsInv3(s0, w0, s1, w1, coefs);
	pDest[w0d] = ApplyWaveCoefficientsInv3(s0, w0, s1, w1, coefs);
	/*LogFormatedA("s0 = %.8X, w0 = %.8X, s1 = %.8X, w1 = %.8X :: s0d = %.8X, w0d = %.8X\r\n",
	s0i, w0i, s1i, w1i, s0d, w0d);*/
	}
	}

	void FloatTest(){
	int32_t table_a[0x100];
	uint8_t table_b[0x200];
	uint32_t i;
	for (i = 0; i < 256; ++i){
	table_a[i] = (int32_t)((double)((int32_t)i - 0x80) * 1.772);

	}

	for (i = 0; i < 256; ++i){
	int32_t n = (int32_t)i - 128;
	if (n < 0){
	n = 0;
	} else if (n > 255){
	n = 255;
	}
	table_b[i] = (uint8_t)n;
	}

	const float sub = -1.2582912e7f;
	float f;
	for (f = 0.0f; f < 256.0f; f += 1.0f){

	float t = f - sub;
	uint32_t bits = *(uint32_t*)&t;

	int16_t o16 = (int16_t)bits;
	int32_t o8 = ((int8_t)bits);
	int32_t o32 = table_a[128 + o8];
	uint8_t val = table_b[o16 + o32 + 128];
	LogFormatedA("%G -> %u\r\n", f, val);
	}
	}
	#include "../Shared/Array.h"

	struct SImageDecodeState{

	TDynArray<float> DequantizationBuffer;
	TDynArray<float> RowDecodeBuffer;
	};

	bool DecodeChannel(SImageDecodeState &state, size_t size,
	const int8_t*& pSrc, const int8_t* pSrcEnd, TDynArray<float> &outputBuffer,
	float dequantizationScale){

	state.DequantizationBuffer.Resize(size * size);
	state.RowDecodeBuffer.Resize(size*size);
	outputBuffer.Resize(size*size);
	size_t i, count;
	count = size * (size / 2);

	float *pRowDecodeBuffer = state.RowDecodeBuffer.begin();
	float *pDequantizationBuffer = state.DequantizationBuffer.begin();
	float* pOutputBuffer = outputBuffer.begin();

	//Step 1: dequantize
	for (i = 0; i < count; ++i){

	if (pSrcEnd - pSrc < sizeof(int8_t)){
	return false;
	}
	int32_t val = *(pSrc++);
	if (val == CHAR_MIN){
	if (pSrcEnd - pSrc < sizeof(int16_t)){
	return false;
	}
	val = *(int16_t*)pSrc;
	pSrc += sizeof(int16_t);
	}

	state.DequantizationBuffer[i * 2] = (float)val;


	if (pSrcEnd - pSrc < sizeof(int8_t)){
	return false;
	}
	val = *(pSrc++);
	if (val == CHAR_MIN){
	if (pSrcEnd - pSrc < sizeof(int16_t)){
	return false;
	}
	val = (int16_t)pSrc;
	pSrc += sizeof(int16_t);
	}

	state.DequantizationBuffer[i * 2 + 1] = (float)val;
	}



	//Step 2: decode rows

	SD4Coefficients coefs;
	InitCoefficientsWithScale(coefs, dequantizationScale);
	for (i = 0; i < size; ++i){
	D4InverseTransform3(
	[i, size](size_t index, eD4Component component) -> size_t{
	const size_t baseIndex = index * 2;
	switch (component){
	case eD4Component::Scale:
	return baseIndex + 0;
	case eD4Component::Wavelet:
	return baseIndex + 1;
	default:
	__assume(false);
	}
	},
	[](size_t index, eD4Component component){
	switch (component){
	case eD4Component::Scale:
	return index * 2 + 0;
	case eD4Component::Wavelet:
	return index * 2 + 1;
	default:
	__assume(false);
	}

	}, pDequantizationBuffer + i * size,
	pRowDecodeBuffer + i * size, size/ 2, coefs);
	}

	//Step 3: decode columns


	InitCoefficients(coefs);
	for (i = 0; i < size; ++i){

	/*const size_t s0offset = 0;
	const size_t s1offset = s0offset + maxSize;
	const size_t w0offset = maxSize * (maxSize - 1);
	const size_t w1offset = w0offset + maxSize;*/
	const size_t sOffset = 0;
	const size_t wOffset = size * (size / 2);
	const size_t colOffset = i;
	D4InverseTransform3(
	[colOffset, sOffset, wOffset, size](size_t index, eD4Component component) -> size_t{
	const size_t baseIndex = index * size;
	switch (component){
	case eD4Component::Scale:
	return baseIndex + colOffset + sOffset;
	case eD4Component::Wavelet:
	return baseIndex + colOffset + wOffset;
	default:
	__assume(false);
	}
	},
	[size, colOffset](size_t index, eD4Component component){
	size_t baseIndex = index * 2 * size + colOffset;
	switch (component){
	case eD4Component::Scale:
	return baseIndex;
	case eD4Component::Wavelet:
	return baseIndex + size;
	default:
	__assume(false);
	}

	}, pRowDecodeBuffer, pOutputBuffer, size / 2, coefs);
	}
	return true;
	}


	/**
	 * Decompresses an LZSS bit stream.
	 *
	 * Stream layout, as read below: a 1-bit flag; flag 0 is followed by an 8-bit
	 * literal, flag 1 by a 12-bit back-reference offset and, unless the offset
	 * is 0, a 7-bit (length - 1). An offset of 0 terminates the stream.
	 *
	 * @param inputBuffer  compressed data
	 * @param outputBuffer resized to the decompressed length on success; on
	 *                     failure its size and contents are unspecified
	 * @return false if the input is truncated, a back-reference points before
	 *         the start of the output, or the output would exceed the fixed
	 *         capacity (maxSize) reserved below
	 */
	bool WCTDecompressLZSS(const CDynMemoryBuffer &inputBuffer, CDynMemoryBuffer &outputBuffer){

		size_t bitOffset = 0;
		bool errored = false;
		// Returns `count` bits from the input, or sets `errored` when fewer remain.
		auto ReadBits = [&inputBuffer, &bitOffset, &errored](size_t count) -> uint16_t {

			if (bitOffset + count > inputBuffer.GetSize() * 8){
				errored = true;
				return 0;
			}
			uint16_t value = GetBitsRTL<uint16_t>(inputBuffer.GetBytePtr(), bitOffset, count);
			bitOffset += count;
			return value;
		};

		constexpr size_t maxSize = 256 * 256 * 3 + 128 * 128 * 3 + 128 * 128 * 3;

		outputBuffer.Resize(maxSize);

		uint8_t* const pBegin = outputBuffer.GetBytePtr();
		uint8_t* const pLimit = pBegin + maxSize;
		uint8_t* pDest = pBegin;

		for (;;){
			const uint16_t isDelta = ReadBits(1);
			if (errored){
				return false;
			}

			//One byte literal
			if (isDelta == 0){
				const uint8_t literalValue = (uint8_t)ReadBits(8);
				if (errored){
					return false;
				}
				if (pDest == pLimit){
					// Output capacity exhausted; refuse instead of overflowing.
					return false;
				}
				*(pDest++) = literalValue;
				continue;
			}

			const size_t offset = ReadBits(12);
			if (errored){
				return false;
			}
			if (offset == 0){
				break;
			}

			const size_t length = (size_t)ReadBits(7) + 1;
			if (errored){
				return false;
			}

			if (offset > (size_t)(pDest - pBegin)){
				return false;
			}
			if (length > (size_t)(pLimit - pDest)){
				return false;
			}

			// Byte-by-byte forward copy on purpose: when length > offset the
			// source overlaps the bytes being written and the pattern repeats.
			const uint8_t* pSrc = pDest - offset;
			for (size_t i = 0; i < length; ++i){
				*(pDest++) = *(pSrc++);
			}
		}
		outputBuffer.Resize(pDest - pBegin);
		return true;
	}

	// Returns the larger of a and b; b is returned when the two compare equal.
	template<typename T> T tmax(const T a, const T b){
		if (a > b){
			return a;
		}
		return b;
	}
	// Returns the smaller of a and b; b is returned when the two compare equal.
	template<typename T> T tmin(const T a, const T b){
		if (a < b){
			return a;
		}
		return b;
	}


	// Maps the header's Y quantization value n to a scale of 1 + n / 2.
	static inline float ComputeDequantizationScaleY(uint16_t n){
		const float negativeHalf = static_cast<float>(n) * -0.5f;
		return 1.0f - negativeHalf;
	}

	// Maps a chroma (Cb or Cr) quantization value n to a scale of 1 + n / 4.
	static inline float ComputeDequantizationScaleCbCr(uint16_t n){
		const float negativeQuarter = static_cast<float>(n) * -0.25f;
		return 1.0f - negativeQuarter;
	}


	// Maps the alpha quantization value n to a scale of n + 1.
	static inline float ComputeDequantizationScaleA(uint16_t n){
		const float asFloat = static_cast<float>(n);
		return asFloat + 1.0f;
	}


	bool DecodeImage(const CDynMemoryBuffer &inputBuffer, CDynMemoryBuffer &outputBuffer){

	SWCTHeader header;
	CDynMemoryReader r(&inputBuffer);
	if (!r.Get(header)){
	return false;
	}
	if (header.CompressionType != eWCTType::lzss &&
	header.CompressionType != eWCTType::zlib){
	}

	CDynMemoryBuffer blockBuffer, block0, block1;

	if (header.BlockSize0 == 0){
	return false;
	}
	blockBuffer.Resize(header.BlockSize0);

	if (!r.GetRaw(blockBuffer.GetBytePtr(), header.BlockSize0)){
	return false;
	}
	if (header.CompressionType == eWCTType::lzss){
	if (!WCTDecompressLZSS(blockBuffer, block0)){
	return false;
	}
	} else {
	return false;
	}


	if (header.BlockSize1 != 0 && header.BitCount == 32){
	blockBuffer.Resize(header.BlockSize1);

	if (!r.GetRaw(blockBuffer.GetBytePtr(), header.BlockSize1)){
	return false;
	}
	if (header.CompressionType == eWCTType::lzss){
	if (!WCTDecompressLZSS(blockBuffer, block1)){
	return false;
	}
	} else {
	return false;
	}
	}


	size_t size = tmax<size_t>(GetAlignedSize(tmax<size_t>(header.Height, header.Width)), 8u);

	const int8_t* pDecodeSource = (const int8_t*)block0.GetBytePtr();
	const int8_t* pEnd = pDecodeSource + block0.GetSize();

	SImageDecodeState state;
	TDynArray<float> outputY, outputCb, outputCr, outputA;

	if (!DecodeChannel(state, size, pDecodeSource, pEnd,
	outputY, ComputeDequantizationScaleY(header.YChannelQuantizationScale))){
	return false;
	}
	if (!DecodeChannel(state, size / 2, pDecodeSource, pEnd,
	outputCb, ComputeDequantizationScaleCbCr(header.CbChannelQuantizationScale))){
	return false;
	}
	if (!DecodeChannel(state, size / 2, pDecodeSource, pEnd,
	outputCr, ComputeDequantizationScaleCbCr(header.CrChannelQuantizationScale))){
	return false;
	}

	TDynArray<float> output;


	output.Resize(header.Width * header.Height * 3);

	uint32_t x, y;
	for (y = 0; y < header.Height; ++y){
	for (x = 0; x < header.Width; ++x){
	float cy = outputY[y * size + x];
	float cb = outputCb[((y / 2) * (size / 2) + (x / 2))];
	float cr = outputCr[((y / 2) * (size / 2) + (x / 2))];
	const float r = cy + 1.402f * (cr);
	const float g = cy - 0.344136f * (cb) - 0.714136f * (cr);
	const float b = cy + 1.772f * (cb);

	output[((y * header.Width + x) * 3)] = r;
	output[((y * header.Width + x) * 3) + 1] = g;
	output[((y * header.Width + x) * 3) + 2] = b;
	}
	}
	outputBuffer.Resize(sizeof(float) * header.Width * header.Height * 3);
	memcpy(outputBuffer.GetBytePtr(), output.begin(), sizeof(float)*output.Size());

	return true;
	}

	int main(int argc, const charconstargv){

	//FloatTest();


	SD4Coefficients coefs;
	InitCoefficients(coefs);

	CDynMemoryBuffer textureFile, outputFile;
	LoadFileToBuffer("I:\\red.wct", textureFile);
	LoadFileToBuffer("I:\\TP\\Test.wct", textureFile);

	DecodeImage(textureFile, outputFile);
	WriteBufferToFile("I:\\TP\\ImageData.bin", outputFile);



	SWCTHeader header;
	CDynMemoryBuffer block0, block1;
	{
	CDynMemoryReader r(&textureFile);
	r.Get(header);
	block0.Resize(header.BlockSize0);
	block1.Resize(header.BlockSize1);

	r.GetRaw(block0.GetBytePtr(), header.BlockSize0);
	r.GetRaw(block1.GetBytePtr(), header.BlockSize1);
	}



	uint32_t alignedWidth = GetAlignedSize(header.Width);
	uint32_t alignedHeight = GetAlignedSize(header.Height);

	uint32_t maxSize = alignedWidth < alignedHeight ? alignedHeight : alignedWidth;
	uint32_t halfMaxSize = maxSize / 2;


	const float float8 = 1.0f - ((float)header.YChannelQuantizationScale * -0.5f);
	const float floatA = 1.0f - ((float)header.CbChannelQuantizationScale * -0.25f);
	const float floatC = 1.0f - ((float)header.CrChannelQuantizationScale * -0.25f);
	const float floatE = (float)header.AChannelQuantizationScale + 1.0f;


	CTPWBitReader br(block0);

	static uint8_t decodeBuffer[0x60000];
	uint8_t* pDest = decodeBuffer;

	for (;;){
	uint16_t isDelta = br.GetBits(1);
	//One byte literal
	if (isDelta == 0){
	uint8_t literalValue = (uint8_t)br.GetBits(8);
	LogFormatedA("Literal %.2X\r\n", literalValue);
	*(pDest++) = literalValue;
	} else {
	size_t offset = br.GetBits(12);
	size_t length = br.GetBits(7) + 1;

	if (offset == 0){
	LogFormatedA("End of stream\r\n");
	break;
	}

	LogFormatedA("Delta -%zu[%zu]\r\n", offset, length);

	size_t i;
	const uint8_t* pSrc = pDest - offset;
	for (i = 0; i < length; ++i){
	(pDest++) = (pSrc++);
	}

	//memmove(pDest, pDest - offset, length);
	//pDest += length;
	}
	}
	size_t len2 = pDest - decodeBuffer;


	static float s_FloatBuf0[0x10000];

	static float s_FloatBuf1[0x10000];

	uint32_t i, j;

	uint32_t count2 = halfMaxSize / 2 - 1;




	CDynMemoryBuffer dbuf;

	const int8_t* pDecodeSource = (const int8_t*)decodeBuffer;

	SImageDecodeState state;
	TDynArray<float> outputY, outputCb, outputCr, outputA;

	bool r = DecodeChannel(state, maxSize, pDecodeSource, (const int8_t*)decodeBuffer + len2,
	outputY, float8);
	bool g = DecodeChannel(state, maxSize/2, pDecodeSource, (const int8_t*)decodeBuffer + len2,
	outputCb, floatA);
	bool b = DecodeChannel(state, maxSize/2, pDecodeSource, (const int8_t*)decodeBuffer + len2,
	outputCr, floatC);


	TDynArray<float> output;


	output.Resize(header.Width * header.Height * 3);

	uint32_t x, y;
	for (y = 0; y < header.Height; ++y){
	for (x = 0; x < header.Width; ++x){
	float cy = outputY[y * maxSize + x];
	float cb = outputCb[((y / 2) * (maxSize / 2) + (x / 2))];
	float cr = outputCr[((y / 2) * (maxSize / 2) + (x / 2))];
	const float r = cy + 1.402f * (cr - 128.0f);
	const float g = cy - 0.344136f * (cb - 128.0f) - 0.714136f * (cr - 128.0f);
	const float b = cy + 1.772f * (cb - 128.0f);

	output[((y * header.Width + x) * 3)] = r;
	output[((y * header.Width + x) * 3)+1] = g;
	output[((y * header.Width + x) * 3)+2] = b;
	}
	}

	{
	FILE* f;
	fopen_s(&f, "I:\\TP\\ImageData.bin", "wb");
	fwrite(output.begin(), sizeof(float), output.Size(), f);
	fclose(f);
	}



	i = 0;

	/*{
	float float8_s0 = float8 * g_Scale0;
	float float8_s1 = float8 * g_Scale1;
	float float8_s2 = float8 * g_Scale2;
	float float8_s3 = float8 * g_Scale3;

	const int8_t* pSrc = (const int8_t*)decodeBuffer;
	CVec2f* pDest = s_FloatBuf0;
	for (i = 0; i < maxSize; ++i){

	const float val0f = FetchFloat(pSrc);
	const float val1f = FetchFloat(pSrc);



	CVec2f* pDest2 = pDest + 1;
	for (j = 0; j < count2; ++j){
	const float val0f = FetchFloat(pSrc);
	const float val1f = FetchFloat(pSrc);

	*(pDest2++) = CVec2f(
	val0f * float8_s2 + val1f * float8_s1 + val0f * float8_s0 + val1f * float8_s3,
	val0f * float8_s3 + val0f * float8_s1 - val1f * float8_s0 - val1f * float8_s2
	);
	}
	*pDest = CVec2f(
	val0f * float8_s2 + val1f * float8_s1 + val0f * float8_s0 + val1f * float8_s3,
	val0f * float8_s3 + val0f * float8_s1 - val1f * float8_s0 - val1f * float8_s2
	);



	pDest += maxSize/2;
	}
	}*/
	{
	float* pDest = s_FloatBuf1;

	}

	size_t len = pDest - decodeBuffer;


	uint32_t width = 63;

	uint32_t value = (width - 1);
	/* uint32_t n = 1;
	for (; (value >>= 1) != 0;){
	++n;
	}*/

	return 0;
	}