aras-p/D3D9ByteCode.cpp

## D3D9ByteCode.cpp
#include "UnityPrefix.h"
#include "D3D9ByteCode.h"

// D3D9 shader bytecode format on MSDN: https://msdn.microsoft.com/en-us/library/windows/hardware/ff552891.aspx

// Upper 16 bits of the version token; the first bytecode token is masked and compared
// against this value to recognize vertex shaders.
const UInt32 kD3D9ShaderTypeVertex = 0xFFFEu << 16;

// Source swizzle: four 2-bit component selectors, the first one starting at this bit.
const UInt32 kD3D9SwizzleShift = 16;
// Identity swizzle: selector N picks component N.
const UInt32 kD3D9NoSwizzle =
	(0u << (kD3D9SwizzleShift + 0)) |
	(1u << (kD3D9SwizzleShift + 2)) |
	(2u << (kD3D9SwizzleShift + 4)) |
	(3u << (kD3D9SwizzleShift + 6));

// Destination write mask: one bit per component, bits 16..19.
const UInt32 kD3D9WriteMaskX = 1u << 16;
const UInt32 kD3D9WriteMaskY = 1u << 17;
const UInt32 kD3D9WriteMaskZ = 1u << 18;
const UInt32 kD3D9WriteMaskW = 1u << 19;


// Instruction opcodes, as found in the low 16 bits of an instruction token (see DecodeOpcode).
// Enumerators without an explicit value continue from the previous one, so the order of the
// entries determines their numeric values and must not be changed.
enum D3D9Opcode
{
	kD3D9Op_NOP = 0,
	kD3D9Op_MOV,
	kD3D9Op_ADD,
	kD3D9Op_SUB,
	kD3D9Op_MAD,
	kD3D9Op_MUL,
	kD3D9Op_RCP,
	kD3D9Op_RSQ,
	kD3D9Op_DP3,
	kD3D9Op_DP4,
	kD3D9Op_MIN,
	kD3D9Op_MAX,
	kD3D9Op_SLT,
	kD3D9Op_SGE,
	kD3D9Op_EXP,
	kD3D9Op_LOG,
	kD3D9Op_LIT,
	kD3D9Op_DST,
	kD3D9Op_LRP,
	kD3D9Op_FRC,
	kD3D9Op_M4x4,
	kD3D9Op_M4x3,
	kD3D9Op_M3x4,
	kD3D9Op_M3x3,
	kD3D9Op_M3x2,
	kD3D9Op_CALL,
	kD3D9Op_CALLNZ,
	kD3D9Op_LOOP,
	kD3D9Op_RET,
	kD3D9Op_ENDLOOP,
	kD3D9Op_LABEL,
	kD3D9Op_DCL, // 31
	kD3D9Op_POW,
	kD3D9Op_CRS,
	kD3D9Op_SGN,
	kD3D9Op_ABS,
	kD3D9Op_NRM,
	kD3D9Op_SINCOS,
	kD3D9Op_REP,
	kD3D9Op_ENDREP,
	kD3D9Op_IF,
	kD3D9Op_IFC,
	kD3D9Op_ELSE,
	kD3D9Op_ENDIF,
	kD3D9Op_BREAK,
	kD3D9Op_BREAKC,
	kD3D9Op_MOVA,
	kD3D9Op_DEFB,
	kD3D9Op_DEFI, // 48

	// numbering restarts at 64
	kD3D9Op_TEXCOORD = 64,
	kD3D9Op_TEXKILL,
	kD3D9Op_TEX,
	kD3D9Op_TEXBEM,
	kD3D9Op_TEXBEML,
	kD3D9Op_TEXREG2AR,
	kD3D9Op_TEXREG2GB,
	kD3D9Op_TEXM3x2PAD,
	kD3D9Op_TEXM3x2TEX,
	kD3D9Op_TEXM3x3PAD,
	kD3D9Op_TEXM3x3TEX,
	kD3D9Op_RESERVED0,
	kD3D9Op_TEXM3x3SPEC,
	kD3D9Op_TEXM3x3VSPEC,
	kD3D9Op_EXPP,
	kD3D9Op_LOGP,
	kD3D9Op_CND,
	kD3D9Op_DEF, // 81
	kD3D9Op_TEXREG2RGB,
	kD3D9Op_TEXDP3TEX,
	kD3D9Op_TEXM3x2DEPTH,
	kD3D9Op_TEXDP3,
	kD3D9Op_TEXM3x3,
	kD3D9Op_TEXDEPTH,
	kD3D9Op_CMP,
	kD3D9Op_BEM,
	kD3D9Op_DP2ADD,
	kD3D9Op_DSX,
	kD3D9Op_DSY,
	kD3D9Op_TEXLDD,
	kD3D9Op_SETP,
	kD3D9Op_TEXLDL,
	kD3D9Op_BREAKP, // 96

	// explicitly numbered opcodes at the top of the 16-bit opcode range
	kD3D9Op_PHASE = 0xFFFD,
	kD3D9Op_COMMENT = 0xFFFE, // uses its own length field, see DecodeCommentLength
	kD3D9Op_END = 0xFFFF, // stops the token walk in NextToken
};


// Register types referenced by parameter tokens. The 5-bit value is stored split over two
// bit fields of the token; see DecodeRegisterType / EncodeRegisterType for the layout.
enum D3D9Register
{
	kD3D9Reg_NONE = -1, // not a real register type; not encodable as-is (EncodeRegisterType masks the value)
	kD3D9Reg_TEMP = 0,
	kD3D9Reg_INPUT = 1,
	kD3D9Reg_CONST = 2,
	kD3D9Reg_ADDR_or_TEXTURE = 3,
	kD3D9Reg_RASTOUT = 4, // used as the position output type for SM2.0 by PatchD3D9ShaderHalfPixelOffset
	kD3D9Reg_ATTROUT = 5,
	kD3D9Reg_OUTPUT = 6, // used as the position output type for SM3.0 by PatchD3D9ShaderHalfPixelOffset
	kD3D9Reg_CONST_INT = 7,
	kD3D9Reg_COLOROUT = 8,
	kD3D9Reg_DEPTHOUT = 9,
	kD3D9Reg_SAMPLER = 10,
	kD3D9Reg_CONST2 = 11, // constants 2048..4095
	kD3D9Reg_CONST3 = 12, // constants 4096..6143
	kD3D9Reg_CONST4 = 13, // constants 6144..8191
	kD3D9Reg_CONST_BOOL = 14,
	kD3D9Reg_LOOP = 15,
	kD3D9Reg_TEMPFLOAT16 = 16, // temp for half-precision floats
	kD3D9Reg_MISC = 17,
	kD3D9Reg_LABEL = 18, // label pseudo-register
	kD3D9Reg_PREDICATE = 19,
};


// Splits a shader version token into its parts:
//   outType  - upper 16 bits, left in place (not shifted down)
//   outMajor - bits 8..15
//   outMinor - bits 0..7
static void DecodeShaderVersionD3D9(UInt32 token, UInt32* outType, UInt32* outMajor, UInt32* outMinor)
{
	const UInt32 typeBits = token & 0xFFFF0000;
	const UInt32 majorVersion = (token & 0x0000FF00) >> 8;
	const UInt32 minorVersion = token & 0x000000FF;

	*outType = typeBits;
	*outMajor = majorVersion;
	*outMinor = minorVersion;
}

// Returns the opcode stored in the low 16 bits of an instruction token.
static D3D9Opcode DecodeOpcode(UInt32 token)
{
	const UInt32 opcodeBits = token & 0xFFFF;
	return static_cast<D3D9Opcode>(opcodeBits);
}

// Returns the instruction length field (bits 24..27 of an instruction token).
// NextToken advances by this many tokens plus one for the instruction token itself.
static UInt32 DecodeInstructionLength(UInt32 token)
{
	const UInt32 lengthField = (token >> 24) & 0xF;
	return lengthField;
}

// Returns the length field of a comment token (bits 16..30). NextToken uses it instead of
// the regular instruction length when the opcode is kD3D9Op_COMMENT.
static UInt32 DecodeCommentLength(UInt32 token)
{
	const UInt32 lengthField = (token >> 16) & 0x7FFF;
	return lengthField;
}

// Returns the register index stored in the low 11 bits of a parameter token.
static UInt32 DecodeRegisterIndex(UInt32 token)
{
	const UInt32 kIndexMask = 0x7FF;
	return token & kIndexMask;
}

// Produces the register index bits (low 11 bits) of a parameter token.
// Values that do not fit into 11 bits are truncated, not rejected.
static UInt32 EncodeRegisterIndex(int index)
{
	const UInt32 kIndexMask = 0x7FF;
	return static_cast<UInt32>(index) & kIndexMask;
}

// Reassembles the register type from the two bit fields of a parameter token:
// token bits 28..30 hold type bits 0..2, token bits 11..12 hold type bits 3..4.
static D3D9Register DecodeRegisterType(UInt32 token)
{
	const UInt32 lowTypeBits = (token >> 28) & 0x07;
	const UInt32 highTypeBits = (token >> 8) & 0x18;
	return static_cast<D3D9Register>(lowTypeBits | highTypeBits);
}

// Spreads a register type over the two bit fields of a parameter token:
// type bits 0..2 go to token bits 28..30, type bits 3..4 go to token bits 11..12.
static UInt32 EncodeRegisterType(D3D9Register type)
{
	const UInt32 typeBits = static_cast<UInt32>(type);
	const UInt32 lowField = (typeBits & 0x07) << 28;
	const UInt32 highField = (typeBits & 0x18) << 8;
	return lowField | highField;
}

// Builds source swizzle bits in which all four 2-bit selectors are set to "comp",
// i.e. one component is replicated into every channel.
static UInt32 EncodeReplicateSwizzle(UInt32 comp)
{
	UInt32 swizzle = 0;
	for (UInt32 channel = 0; channel < 4; ++channel)
		swizzle |= comp << (kD3D9SwizzleShift + channel * 2);
	return swizzle;
}


// Advances inOutIndex from the instruction token it points at to the token following that
// instruction (instruction token plus its operand tokens, or plus the comment payload).
//
// Returns true only when another instruction can be read at the new index. Returns false when
// the index is already out of range, when the instruction just skipped was END, or when the
// advance reaches or passes the end of the bytecode (truncated stream / missing END). In the
// last case the index is clamped to byteCode.size(), so callers that loop on the return value
// never index past the vector and the index stays usable as an insert position.
static bool NextToken(const D3D9ShaderByteCode& byteCode, size_t& inOutIndex)
{
	const size_t tokenCount = byteCode.size();
	if (inOutIndex >= tokenCount)
		return false;

	const UInt32 token = byteCode[inOutIndex];
	const D3D9Opcode op = DecodeOpcode(token);
	// comment instructions have different length encoding
	const UInt32 length = (op == kD3D9Op_COMMENT) ? DecodeCommentLength(token) : DecodeInstructionLength(token);

	inOutIndex += length + 1;
	if (inOutIndex > tokenCount)
		inOutIndex = tokenCount;

	if (op == kD3D9Op_END)
		return false;
	return inOutIndex < tokenCount;
}

// "regular" instructions have destination + source registers right after instruction token
static bool IsRegularInstruction(D3D9Opcode op)
{
	// END, comments, declarations and constant definitions carry other kinds of payload;
	// everything else is treated as "opcode followed by register tokens".
	switch (op)
	{
	case kD3D9Op_END:
	case kD3D9Op_COMMENT:
	case kD3D9Op_DCL:
	case kD3D9Op_DEF:
	case kD3D9Op_DEFI:
	case kD3D9Op_DEFB:
		return false;
	default:
		return true;
	}
}


static int FindUnusedTempRegisterD3D9(const D3D9ShaderByteCode& byteCode)
{
	size_t index = 1;
	// Find max used temporary register slot.
	// HLSL compiler does fairly tight temporary register allocation,
	// so we'll go with "max used + 1" as the "free register".
	int maxUsed = -1;
	do
	{
		const D3D9Opcode op = DecodeOpcode(byteCode[index]);
		if (IsRegularInstruction(op))
		{
			const UInt32 length = DecodeInstructionLength(byteCode[index]);
			for (UInt32 i = 0; i < length; ++i)
			{
				UInt32 token = byteCode[index + 1 + i];
				D3D9Register type = DecodeRegisterType(token);
				if (type == kD3D9Reg_TEMP)
				{
					int regIndex = DecodeRegisterIndex(token);
					if (regIndex > maxUsed)
						maxUsed = regIndex;
				}
			}
		}

	} while (NextToken(byteCode, index));
	return maxUsed + 1;
}

// Finds output position register index in VS3.0
static int FindPositionOutputRegisterD3D9(const D3D9ShaderByteCode& byteCode)
{
	size_t index = 1;
	do
	{
		const D3D9Opcode op = DecodeOpcode(byteCode[index]);
		if (op == kD3D9Op_DCL)
		{
			const UInt32 length = DecodeInstructionLength(byteCode[index]);
			if (length >= 2)
			{
				UInt32 token1 = byteCode[index + 1];
				UInt32 token2 = byteCode[index + 2];
				if (token1 == 0x80000000 && DecodeRegisterType(token2) == kD3D9Reg_OUTPUT)
				{
					return DecodeRegisterIndex(token2);
				}
			}
		}

	} while (NextToken(byteCode, index));

	return -1;
}


// Rewrites all usages of "src" register into "dst" one.
//
// Only operand tokens of regular instructions (see IsRegularInstruction) are touched, so
// declarations and constant definitions keep referring to the original register. Register type
// and index bits are replaced; all other bits of the token (write mask, swizzle, modifiers)
// are preserved. Operand access is bounds-checked against the vector size.
static void RewriteRegisterD3D9(D3D9ShaderByteCode& byteCode, D3D9Register srcType, int srcIndex, D3D9Register dstType, int dstIndex)
{
	const UInt32 kTypeAndIndexMask = 0x70001800 | 0x000007FF;
	const UInt32 replacementBits = EncodeRegisterType(dstType) | EncodeRegisterIndex(dstIndex);
	const UInt32 wantedIndex = static_cast<UInt32>(srcIndex);

	const size_t tokenCount = byteCode.size();
	size_t index = 1;
	while (index < tokenCount)
	{
		const UInt32 instruction = byteCode[index];
		if (IsRegularInstruction(DecodeOpcode(instruction)))
		{
			// clamp the operand count to the tokens that actually exist
			const size_t available = tokenCount - index - 1;
			size_t operandCount = DecodeInstructionLength(instruction);
			if (operandCount > available)
				operandCount = available;

			for (size_t i = 0; i < operandCount; ++i)
			{
				UInt32& token = byteCode[index + 1 + i];
				if (DecodeRegisterType(token) != srcType || DecodeRegisterIndex(token) != wantedIndex)
					continue;
				token = (token & ~kTypeAndIndexMask) | replacementBits;
			}
		}

		if (!NextToken(byteCode, index))
			break;
	}
}


static size_t FindEndOfShaderD3D9(const D3D9ShaderByteCode& byteCode)
{
	size_t index = 1;
	do
	{
		const D3D9Opcode op = DecodeOpcode(byteCode[index]);
		if (op == kD3D9Op_END)
			return index;
	} while (NextToken(byteCode, index));
	return index;
}


// Patches a SM2.0 / SM3.0 vertex shader so that its position output goes through a temporary
// register, and appends code that writes the adjusted position:
//   mad oPos.xy, tmpPos.w, constFixup, tmpPos
//   mov oPos.zw, tmpPos
// constantWithFixupInfo is the index of the constant register used as "constFixup".
//
// Returns NULL on success, or a static error message on failure. All validation happens
// before the first modification, so byteCode is left untouched whenever an error is returned.
const char* PatchD3D9ShaderHalfPixelOffset(D3D9ShaderByteCode& byteCode, int constantWithFixupInfo)
{
	// sanity and version checks
	if (byteCode.empty())
		return "Got empty vertex shader";

	UInt32 shaderType, shaderVersionMajor, shaderVersionMinor;
	DecodeShaderVersionD3D9(byteCode[0], &shaderType, &shaderVersionMajor, &shaderVersionMinor);
	if (shaderType != kD3D9ShaderTypeVertex)
		return "Got a non-vertex shader";

	const bool isSM30 = (shaderVersionMajor == 3 && shaderVersionMinor == 0);
	const bool isSM20 = (shaderVersionMajor == 2 && shaderVersionMinor == 0);
	if (!isSM30 && !isSM20)
		return "Only supports SM2.0 and SM3.0 shaders";

	// The register index field is 11 bits wide (see EncodeRegisterIndex). An index outside of
	// it would be silently truncated and the fixup would read some other constant.
	if (constantWithFixupInfo < 0 || constantWithFixupInfo > 0x7FF)
		return "Fixup constant register index is out of range";

	// All scanning helpers start at token 1, right after the version token.
	if (byteCode.size() < 2)
		return "Vertex shader has no instructions";

	// Locate the END token up front. If the instruction walk reaches END, the scans below follow
	// the same walk and stay inside the vector; otherwise the shader is rejected unmodified.
	const size_t insertPos = FindEndOfShaderD3D9(byteCode);
	if (insertPos >= byteCode.size() || DecodeOpcode(byteCode[insertPos]) != kD3D9Op_END)
		return "Could not find end of vertex shader";

	// overall process is:
	// 1) find unused temporary register
	// 2) replace all position writes to use the temporary register
	// 3) insert instruction at the end to output that temporary + fixup into position

	// Find vertex output register
	D3D9Register positionType = kD3D9Reg_RASTOUT;
	int positionIndex = 0;
	if (isSM30)
	{
		// in SM3.0, position is part of generic output registers, so find it from declarations
		positionType = kD3D9Reg_OUTPUT;
		positionIndex = FindPositionOutputRegisterD3D9(byteCode);
		if (positionIndex < 0)
		{
			return "Could not find SM3.0 vertex output register index";
		}
	}

	// Find temporary register to use
	const int tempIndex = FindUnusedTempRegisterD3D9(byteCode);
	if (isSM30 && tempIndex >= 32)
		return "Out of temporary registers in SM3.0";
	if (isSM20 && tempIndex >= 12)
		return "Out of temporary registers in SM2.0";

	// Rewrite positions usages into the temp (does not change the token count, insertPos stays valid)
	RewriteRegisterD3D9(byteCode, positionType, positionIndex, kD3D9Reg_TEMP, tempIndex);

	// Insert instructions to do fixup right before END
	const UInt32 kParameterTokenBit = 0x80000000; // bit 31 is set in every parameter token written here
	const UInt32 positionReg = EncodeRegisterIndex(positionIndex) | EncodeRegisterType(positionType) | kParameterTokenBit;
	const UInt32 tempReg = EncodeRegisterIndex(tempIndex) | EncodeRegisterType(kD3D9Reg_TEMP) | kParameterTokenBit;
	const UInt32 fixupReg = EncodeRegisterIndex(constantWithFixupInfo) | EncodeRegisterType(kD3D9Reg_CONST) | kParameterTokenBit;

	const size_t kFixupTokenCount = 8; // 5 tokens for mad, 3 tokens for mov
	const UInt32 fixupTokens[kFixupTokenCount] =
	{
		// mad oPos.xy, tmpPos.w, constFixup, tmpPos
		static_cast<UInt32>(kD3D9Op_MAD) | (4u << 24),
		positionReg | kD3D9WriteMaskX | kD3D9WriteMaskY, // oPos.xy
		tempReg | EncodeReplicateSwizzle(3), // tmpPos.w
		fixupReg | kD3D9NoSwizzle, // constFixup
		tempReg | kD3D9NoSwizzle, // tmpPos
		// mov oPos.zw, tmpPos
		static_cast<UInt32>(kD3D9Op_MOV) | (2u << 24),
		positionReg | kD3D9WriteMaskZ | kD3D9WriteMaskW, // oPos.zw
		tempReg | kD3D9NoSwizzle, // tmpPos
	};
	byteCode.insert(byteCode.begin() + insertPos, fixupTokens, fixupTokens + kFixupTokenCount);

	return NULL;
}

## D3D9ByteCode.h
#pragma once

// Utilities to modify D3D9 shader bytecode

#include <vector>

// D3D9 shader bytecode as a sequence of 32-bit tokens.
// NOTE(review): UInt32 is not declared by this header; it has to be visible before this header
// is included (presumably via UnityPrefix.h, which the .cpp includes first - confirm).
typedef std::vector<UInt32> D3D9ShaderByteCode;

// Modifies DX9 vertex shader bytecode to adjust clip space position, so that it matches
// DX11/GL rasterization. constantWithFixupInfo is constant register index that
// will contain screen size information.
//
// Returns NULL on success, or error message on failure.
//
// The basic idea is: we can make DX9 "half texel offset" thing be completely gone, by
// shifting XY components of clip space position in all vertex shaders by half a viewport
// pixel. This is actually what DX11 9.x feature level does behind the scenes (shader compiler
// inserts the fixup, and runtime supplies the shader constant). It's also done by WebGL ANGLE
// (see "The ANGLE Project: Implementing OpenGL ES 2.0 on Direct3D" article from OpenGL Insights
// book).
//
// So we do the same here: insert fixup code in all DX9 vertex shaders. GfxDeviceD3D9 will supply
// viewport info into the shader constant at runtime.
//
// Note, this assumes constantWithFixupInfo is not already used by the shader. We detect that from
// reflection information above this call.
const char* PatchD3D9ShaderHalfPixelOffset(D3D9ShaderByteCode& byteCode, int constantWithFixupInfo);

## D3D9ByteCodeTests.cpp
#include "UnityPrefix.h"

#if ENABLE_UNIT_TESTS

#include "D3D9ByteCode.h"
#include "Editor/Src/Utility/d3d11/D3D11Compiler.h"
#include "Runtime/Testing/Testing.h"
#include "../ShaderCompiler.h"

extern D3D11Compiler g_D3D11Compiler;


INTEGRATION_TEST_SUITE(D3D9ByteCodeTests)
{
	// Test helper: assembles "source" into D3D9 bytecode, runs PatchD3D9ShaderHalfPixelOffset on
	// it with constant register 255, and returns the result.
	//
	// expectToFail == false: returns the disassembly of the patched shader with comments and
	//   indentation removed; returns "" (after logging an error) when assembling, patching or
	//   disassembling fails.
	// expectToFail == true: returns the error message produced by the patcher, or "" when the
	//   patch unexpectedly succeeded.
	static std::string PatchHalfPixelOffsetVertexShaderAsm(const char* source, bool expectToFail = false)
	{
		HRESULT hr;
		// assemble source shader string into DX9 bytecode
		D3D10Blob* bytecodeBlob;
		hr = g_D3D11Compiler.D3DAssemble(source, strlen(source), 0, &bytecodeBlob, NULL);
		if (FAILED_IMPL(hr))
		{
			ErrorString("Failed to assemble source shader");
			return "";
		}
		const size_t bytecodeSize = bytecodeBlob->GetBufferSize();
		if (bytecodeSize % 4 != 0)
		{
			// release the blob on this error path as well
			bytecodeBlob->Release();
			ErrorString("Assembled shader bytecode size should be multiple of 4");
			return "";
		}

		// get bytecode into a vector of tokens
		D3D9ShaderByteCode bytecode;
		bytecode.reserve(bytecodeSize / 4);
		const UInt8* ptr = (const UInt8*)bytecodeBlob->GetBufferPointer();
		for (size_t i = 0; i < bytecodeSize; i += 4)
		{
			bytecode.push_back(*(const UInt32*)(ptr + i));
		}
		bytecodeBlob->Release();

		// insert position fixup
		const char* msg = PatchD3D9ShaderHalfPixelOffset(bytecode, 255);
		if (expectToFail)
		{
			// NULL means success; constructing std::string from a NULL pointer is undefined
			return msg ? msg : "";
		}
		if (msg != NULL)
		{
			// do not disassemble (and silently accept) a shader that was not patched
			ErrorString(msg);
			return "";
		}

		// disassemble the shader
		D3D10Blob* disasm;
		hr = g_D3D11Compiler.D3DDisassemble(&bytecode[0], bytecode.size()*4, 0, NULL, &disasm);
		if (FAILED_IMPL(hr))
		{
			ErrorString("Failed to disassemble patched shader");
			return "";
		}
		std::string res = (const char*)disasm->GetBufferPointer();
		disasm->Release();

		// remove comments, indentation, trim
		RemoveCommentsFromAsm(res);
		res = DeindentAsmCode(res);
		res = Trim(res, " \r\n\t");

		return res;
	}

	// vs_2_0: the single write to oPos is redirected to r0 (no temporaries were in use),
	// and the mad/mov fixup pair is appended at the end.
	TEST(Trivial_VS20_Works)
	{
		const char* src =
			"vs_2_0\n"
			"def c0, 1, 0, 0, 0\n"
			"mov oPos, c0.x\n";
		std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
		std::string exp =
			"vs_2_0\n"
			"def c0, 1, 0, 0, 0\n"
			"mov r0, c0.x\n"
			"mad oPos.xy, r0.w, c255, r0\n"
			"mov oPos.zw, r0";
		CHECK_EQUAL(exp, res);
	}

	// vs_3_0: the position output is found through its dcl_position declaration (o0 here);
	// the declaration itself stays untouched while the write to o0 is redirected to r0.
	TEST(Trivial_VS30_Works)
	{
		const char* src =
			"vs_3_0\n"
			"def c0, 1, 0, 0, 0\n"
			"dcl_position o0\n"
			"mov o0, c0.x\n";
		std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
		std::string exp =
			"vs_3_0\n"
			"def c0, 1, 0, 0, 0\n"
			"dcl_position o0\n"
			"mov r0, c0.x\n"
			"mad o0.xy, r0.w, c255, r0\n"
			"mov o0.zw, r0";
		CHECK_EQUAL(exp, res);
	}

	// Runs the patcher over a source with comments in many places. The returned text is
	// discarded, so this only verifies that the helper runs through without logging an error
	// or crashing; the patched output itself is not compared against anything.
	TEST(Patching_Handles_Comments_Correctly)
	{
		const char* src =
			"// one comment there\n"
			"vs_3_0\n"
			"// another comment right here\n"
			"def c0, 1, 0, 0, 0\n"
			"// another comment here, trying to make it fairly long to see if we correctly decode the length field. Still making it long...\n"
			"dcl_position o0\n"
			"// what do you know, another comment\n"
			"mov o0, c0.x\n"
			"// perhaps unexpectedly, a comment at the end!\n";
		PatchHalfPixelOffsetVertexShaderAsm(src);
	}

	// vs_2_0 with r0 already in use: the patcher must pick r1 as the temporary, redirect all
	// four per-component oPos writes to it and leave the other outputs (oT0, oT1, oD0) alone.
	TEST(NonTrivial_VS20_Works)
	{
		const char* src =
			"vs_2_0\n"
			"dcl_position v0\n"
			"dcl_texcoord v1\n"
			"pow r0.x, v1.x, v1.y\n"
			"mul r0.xy, r0.x, v1\n"
			"add oT0.xy, r0.y, r0.x\n"
			"add oT1.xyz, -v0, c4\n"
			"mul oD0, v0, c4\n"
			"dp4 oPos.x, v0, c0\n"
			"dp4 oPos.y, v0, c1\n"
			"dp4 oPos.z, v0, c2\n"
			"dp4 oPos.w, v0, c3\n";
		std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
		std::string exp =
			"vs_2_0\n"
			"dcl_position v0\n"
			"dcl_texcoord v1\n"
			"pow r0.x, v1.x, v1.y\n"
			"mul r0.xy, r0.x, v1\n"
			"add oT0.xy, r0.y, r0.x\n"
			"add oT1.xyz, -v0, c4\n"
			"mul oD0, v0, c4\n"
			"dp4 r1.x, v0, c0\n"
			"dp4 r1.y, v0, c1\n"
			"dp4 r1.z, v0, c2\n"
			"dp4 r1.w, v0, c3\n"
			"mad oPos.xy, r1.w, c255, r1\n"
			"mov oPos.zw, r1";
		CHECK_EQUAL(exp, res);
	}

	// vs_3_0 with several declared outputs: position is o3 (not the first output) and r0 is
	// already in use, so writes to o3 must be redirected to r1 while o0..o2 stay unchanged.
	TEST(NonTrivial_VS30_Works)
	{
		const char* src =
			"vs_3_0\n"
			"dcl_position v0\n"
			"dcl_texcoord v1\n"
			"dcl_texcoord o0.xy\n"
			"dcl_texcoord1 o1.xyz\n"
			"dcl_color o2\n"
			"dcl_position o3\n"
			"pow r0.x, v1.x, v1.y\n"
			"mul r0.xy, r0.x, v1\n"
			"add o0.xy, r0.y, r0.x\n"
			"add o1.xyz, c4, -v0\n"
			"mul o2, c4, v0\n"
			"dp4 o3.x, v0, c0\n"
			"dp4 o3.y, v0, c1\n"
			"dp4 o3.z, v0, c2\n"
			"dp4 o3.w, v0, c3\n";
		std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
		std::string exp =
			"vs_3_0\n"
			"dcl_position v0\n"
			"dcl_texcoord v1\n"
			"dcl_texcoord o0.xy\n"
			"dcl_texcoord1 o1.xyz\n"
			"dcl_color o2\n"
			"dcl_position o3\n"
			"pow r0.x, v1.x, v1.y\n"
			"mul r0.xy, r0.x, v1\n"
			"add o0.xy, r0.y, r0.x\n"
			"add o1.xyz, c4, -v0\n"
			"mul o2, c4, v0\n"
			"dp4 r1.x, v0, c0\n"
			"dp4 r1.y, v0, c1\n"
			"dp4 r1.z, v0, c2\n"
			"dp4 r1.w, v0, c3\n"
			"mad o3.xy, r1.w, c255, r1\n"
			"mov o3.zw, r1";
		CHECK_EQUAL(exp, res);
	}

	// tests for error conditions

	// A pixel shader must be refused: the helper is called in "expect failure" mode and has to
	// hand back a non-empty error message.
	TEST(Error_PS20_Input_IsRejected)
	{
		const char* src =
			"ps_2_0\n"
			"mov oC0, c0\n";
		std::string msg = PatchHalfPixelOffsetVertexShaderAsm(src, true);
		CHECK_MSG(!msg.empty(), "Error message should be returned");
	}

	// Using r11 makes "max used + 1" equal to 12, which the patcher rejects for SM2.0;
	// a non-empty error message is expected.
	TEST(Error_VS20_With12TempRegisterUsed_IsRejected)
	{
		const char* src =
			"vs_2_0\n"
			"mov r11, c0\n" // we only scan for highest used register index, this will make us think all 12 temps in VS2.0 are used
			"mov oPos, r11\n";
		std::string msg = PatchHalfPixelOffsetVertexShaderAsm(src, true);
		CHECK_MSG(!msg.empty(), "Error message should be returned");
	}


} // UNIT_TEST_SUITE(D3D9ByteCodeTests)

#endif // ENABLE_UNIT_TESTS
	#include "UnityPrefix.h"
	#include "D3D9ByteCode.h"

	// D3D9 shader bytecode format on MSDN: https://msdn.microsoft.com/en-us/library/windows/hardware/ff552891.aspx

	// Upper 16 bits of the version token; the first bytecode token is masked and compared
	// against this value to recognize vertex shaders.
	const UInt32 kD3D9ShaderTypeVertex = 0xFFFEu << 16;

	// Source swizzle: four 2-bit component selectors, the first one starting at this bit.
	const UInt32 kD3D9SwizzleShift = 16;
	// Identity swizzle: selector N picks component N.
	// (The escaped "\|" sequences of the original line were not valid C++ and are restored to "|".)
	const UInt32 kD3D9NoSwizzle =
		(0u << (kD3D9SwizzleShift + 0)) |
		(1u << (kD3D9SwizzleShift + 2)) |
		(2u << (kD3D9SwizzleShift + 4)) |
		(3u << (kD3D9SwizzleShift + 6));

	// Destination write mask: one bit per component, bits 16..19.
	const UInt32 kD3D9WriteMaskX = 1u << 16;
	const UInt32 kD3D9WriteMaskY = 1u << 17;
	const UInt32 kD3D9WriteMaskZ = 1u << 18;
	const UInt32 kD3D9WriteMaskW = 1u << 19;


	// Instruction opcodes, as found in the low 16 bits of an instruction token (see DecodeOpcode).
	// Enumerators without an explicit value continue from the previous one, so the order of the
	// entries determines their numeric values and must not be changed.
	enum D3D9Opcode
	{
	kD3D9Op_NOP = 0,
	kD3D9Op_MOV,
	kD3D9Op_ADD,
	kD3D9Op_SUB,
	kD3D9Op_MAD,
	kD3D9Op_MUL,
	kD3D9Op_RCP,
	kD3D9Op_RSQ,
	kD3D9Op_DP3,
	kD3D9Op_DP4,
	kD3D9Op_MIN,
	kD3D9Op_MAX,
	kD3D9Op_SLT,
	kD3D9Op_SGE,
	kD3D9Op_EXP,
	kD3D9Op_LOG,
	kD3D9Op_LIT,
	kD3D9Op_DST,
	kD3D9Op_LRP,
	kD3D9Op_FRC,
	kD3D9Op_M4x4,
	kD3D9Op_M4x3,
	kD3D9Op_M3x4,
	kD3D9Op_M3x3,
	kD3D9Op_M3x2,
	kD3D9Op_CALL,
	kD3D9Op_CALLNZ,
	kD3D9Op_LOOP,
	kD3D9Op_RET,
	kD3D9Op_ENDLOOP,
	kD3D9Op_LABEL,
	kD3D9Op_DCL, // 31
	kD3D9Op_POW,
	kD3D9Op_CRS,
	kD3D9Op_SGN,
	kD3D9Op_ABS,
	kD3D9Op_NRM,
	kD3D9Op_SINCOS,
	kD3D9Op_REP,
	kD3D9Op_ENDREP,
	kD3D9Op_IF,
	kD3D9Op_IFC,
	kD3D9Op_ELSE,
	kD3D9Op_ENDIF,
	kD3D9Op_BREAK,
	kD3D9Op_BREAKC,
	kD3D9Op_MOVA,
	kD3D9Op_DEFB,
	kD3D9Op_DEFI, // 48

	// numbering restarts at 64
	kD3D9Op_TEXCOORD = 64,
	kD3D9Op_TEXKILL,
	kD3D9Op_TEX,
	kD3D9Op_TEXBEM,
	kD3D9Op_TEXBEML,
	kD3D9Op_TEXREG2AR,
	kD3D9Op_TEXREG2GB,
	kD3D9Op_TEXM3x2PAD,
	kD3D9Op_TEXM3x2TEX,
	kD3D9Op_TEXM3x3PAD,
	kD3D9Op_TEXM3x3TEX,
	kD3D9Op_RESERVED0,
	kD3D9Op_TEXM3x3SPEC,
	kD3D9Op_TEXM3x3VSPEC,
	kD3D9Op_EXPP,
	kD3D9Op_LOGP,
	kD3D9Op_CND,
	kD3D9Op_DEF, // 81
	kD3D9Op_TEXREG2RGB,
	kD3D9Op_TEXDP3TEX,
	kD3D9Op_TEXM3x2DEPTH,
	kD3D9Op_TEXDP3,
	kD3D9Op_TEXM3x3,
	kD3D9Op_TEXDEPTH,
	kD3D9Op_CMP,
	kD3D9Op_BEM,
	kD3D9Op_DP2ADD,
	kD3D9Op_DSX,
	kD3D9Op_DSY,
	kD3D9Op_TEXLDD,
	kD3D9Op_SETP,
	kD3D9Op_TEXLDL,
	kD3D9Op_BREAKP, // 96

	// explicitly numbered opcodes at the top of the 16-bit opcode range
	kD3D9Op_PHASE = 0xFFFD,
	kD3D9Op_COMMENT = 0xFFFE, // uses its own length field, see DecodeCommentLength
	kD3D9Op_END = 0xFFFF, // stops the token walk in NextToken
	};


	// Register types referenced by parameter tokens. The 5-bit value is stored split over two
	// bit fields of the token; see DecodeRegisterType / EncodeRegisterType for the layout.
	enum D3D9Register
	{
	kD3D9Reg_NONE = -1, // not a real register type; not encodable as-is (EncodeRegisterType masks the value)
	kD3D9Reg_TEMP = 0,
	kD3D9Reg_INPUT = 1,
	kD3D9Reg_CONST = 2,
	kD3D9Reg_ADDR_or_TEXTURE = 3,
	kD3D9Reg_RASTOUT = 4, // used as the position output type for SM2.0 by PatchD3D9ShaderHalfPixelOffset
	kD3D9Reg_ATTROUT = 5,
	kD3D9Reg_OUTPUT = 6, // used as the position output type for SM3.0 by PatchD3D9ShaderHalfPixelOffset
	kD3D9Reg_CONST_INT = 7,
	kD3D9Reg_COLOROUT = 8,
	kD3D9Reg_DEPTHOUT = 9,
	kD3D9Reg_SAMPLER = 10,
	kD3D9Reg_CONST2 = 11, // constants 2048..4095
	kD3D9Reg_CONST3 = 12, // constants 4096..6143
	kD3D9Reg_CONST4 = 13, // constants 6144..8191
	kD3D9Reg_CONST_BOOL = 14,
	kD3D9Reg_LOOP = 15,
	kD3D9Reg_TEMPFLOAT16 = 16, // temp for half-precision floats
	kD3D9Reg_MISC = 17,
	kD3D9Reg_LABEL = 18, // label pseudo-register
	kD3D9Reg_PREDICATE = 19,
	};


	// Splits a shader version token into its parts:
	//   outType  - upper 16 bits, left in place (not shifted down)
	//   outMajor - bits 8..15
	//   outMinor - bits 0..7
	static void DecodeShaderVersionD3D9(UInt32 token, UInt32* outType, UInt32* outMajor, UInt32* outMinor)
	{
		const UInt32 typeBits = token & 0xFFFF0000;
		const UInt32 majorVersion = (token & 0x0000FF00) >> 8;
		const UInt32 minorVersion = token & 0x000000FF;

		*outType = typeBits;
		*outMajor = majorVersion;
		*outMinor = minorVersion;
	}

	// Returns the opcode stored in the low 16 bits of an instruction token.
	static D3D9Opcode DecodeOpcode(UInt32 token)
	{
		const UInt32 opcodeBits = token & 0xFFFF;
		return static_cast<D3D9Opcode>(opcodeBits);
	}

	// Returns the instruction length field (bits 24..27 of an instruction token).
	// NextToken advances by this many tokens plus one for the instruction token itself.
	static UInt32 DecodeInstructionLength(UInt32 token)
	{
		const UInt32 lengthField = (token >> 24) & 0xF;
		return lengthField;
	}

	// Returns the length field of a comment token (bits 16..30). NextToken uses it instead of
	// the regular instruction length when the opcode is kD3D9Op_COMMENT.
	static UInt32 DecodeCommentLength(UInt32 token)
	{
		const UInt32 lengthField = (token >> 16) & 0x7FFF;
		return lengthField;
	}

	// Returns the register index stored in the low 11 bits of a parameter token.
	static UInt32 DecodeRegisterIndex(UInt32 token)
	{
		const UInt32 kIndexMask = 0x7FF;
		return token & kIndexMask;
	}

	// Produces the register index bits (low 11 bits) of a parameter token.
	// Values that do not fit into 11 bits are truncated, not rejected.
	static UInt32 EncodeRegisterIndex(int index)
	{
		const UInt32 kIndexMask = 0x7FF;
		return static_cast<UInt32>(index) & kIndexMask;
	}

	// Reassembles the register type from the two bit fields of a parameter token:
	// token bits 28..30 hold type bits 0..2, token bits 11..12 hold type bits 3..4.
	// (The escaped "\|" of the original line was not valid C++ and is restored to "|".)
	static D3D9Register DecodeRegisterType(UInt32 token)
	{
		const UInt32 lowTypeBits = (token >> 28) & 0x07;
		const UInt32 highTypeBits = (token >> 8) & 0x18;
		return static_cast<D3D9Register>(lowTypeBits | highTypeBits);
	}

	// Spreads a register type over the two bit fields of a parameter token:
	// type bits 0..2 go to token bits 28..30, type bits 3..4 go to token bits 11..12.
	// (The escaped "\|" of the original line was not valid C++ and is restored to "|".)
	static UInt32 EncodeRegisterType(D3D9Register type)
	{
		const UInt32 typeBits = static_cast<UInt32>(type);
		const UInt32 lowField = (typeBits & 0x07) << 28;
		const UInt32 highField = (typeBits & 0x18) << 8;
		return lowField | highField;
	}

	// Builds source swizzle bits in which all four 2-bit selectors are set to "comp",
	// i.e. one component is replicated into every channel.
	// (The escaped "\|" sequences of the original line were not valid C++; the selectors are
	// combined with a plain bitwise OR here.)
	static UInt32 EncodeReplicateSwizzle(UInt32 comp)
	{
		UInt32 swizzle = 0;
		for (UInt32 channel = 0; channel < 4; ++channel)
			swizzle |= comp << (kD3D9SwizzleShift + channel * 2);
		return swizzle;
	}


	// Advances inOutIndex from the instruction token it points at to the token following that
	// instruction (instruction token plus its operand tokens, or plus the comment payload).
	//
	// Returns true only when another instruction can be read at the new index. Returns false when
	// the index is already out of range, when the instruction just skipped was END, or when the
	// advance reaches or passes the end of the bytecode (truncated stream / missing END). In the
	// last case the index is clamped to byteCode.size(), so callers that loop on the return value
	// never index past the vector and the index stays usable as an insert position.
	static bool NextToken(const D3D9ShaderByteCode& byteCode, size_t& inOutIndex)
	{
		const size_t tokenCount = byteCode.size();
		if (inOutIndex >= tokenCount)
			return false;

		const UInt32 token = byteCode[inOutIndex];
		const D3D9Opcode op = DecodeOpcode(token);
		// comment instructions have different length encoding
		const UInt32 length = (op == kD3D9Op_COMMENT) ? DecodeCommentLength(token) : DecodeInstructionLength(token);

		inOutIndex += length + 1;
		if (inOutIndex > tokenCount)
			inOutIndex = tokenCount;

		if (op == kD3D9Op_END)
			return false;
		return inOutIndex < tokenCount;
	}

	// "regular" instructions have destination + source registers right after instruction token
	static bool IsRegularInstruction(D3D9Opcode op)
	{
		// END, comments, declarations and constant definitions carry other kinds of payload;
		// everything else is treated as "opcode followed by register tokens".
		// (Written as a switch; the escaped "\|\|" chain of the original was not valid C++.)
		switch (op)
		{
		case kD3D9Op_END:
		case kD3D9Op_COMMENT:
		case kD3D9Op_DCL:
		case kD3D9Op_DEF:
		case kD3D9Op_DEFI:
		case kD3D9Op_DEFB:
			return false;
		default:
			return true;
		}
	}


	static int FindUnusedTempRegisterD3D9(const D3D9ShaderByteCode& byteCode)
	{
	size_t index = 1;
	// Find max used temporary register slot.
	// HLSL compiler does fairly tight temporary register allocation,
	// so we'll go with "max used + 1" as the "free register".
	int maxUsed = -1;
	do
	{
	const D3D9Opcode op = DecodeOpcode(byteCode[index]);
	if (IsRegularInstruction(op))
	{
	const UInt32 length = DecodeInstructionLength(byteCode[index]);
	for (UInt32 i = 0; i < length; ++i)
	{
	UInt32 token = byteCode[index + 1 + i];
	D3D9Register type = DecodeRegisterType(token);
	if (type == kD3D9Reg_TEMP)
	{
	int regIndex = DecodeRegisterIndex(token);
	if (regIndex > maxUsed)
	maxUsed = regIndex;
	}
	}
	}

	} while (NextToken(byteCode, index));
	return maxUsed + 1;
	}

	// Finds output position register index in VS3.0
	static int FindPositionOutputRegisterD3D9(const D3D9ShaderByteCode& byteCode)
	{
	size_t index = 1;
	do
	{
	const D3D9Opcode op = DecodeOpcode(byteCode[index]);
	if (op == kD3D9Op_DCL)
	{
	const UInt32 length = DecodeInstructionLength(byteCode[index]);
	if (length >= 2)
	{
	UInt32 token1 = byteCode[index + 1];
	UInt32 token2 = byteCode[index + 2];
	if (token1 == 0x80000000 && DecodeRegisterType(token2) == kD3D9Reg_OUTPUT)
	{
	return DecodeRegisterIndex(token2);
	}
	}
	}

	} while (NextToken(byteCode, index));

	return -1;
	}


	// Rewrites all usages of "src" register into "dst" one.
	//
	// Only operand tokens of regular instructions (see IsRegularInstruction) are touched, so
	// declarations and constant definitions keep referring to the original register. Register type
	// and index bits are replaced; all other bits of the token (write mask, swizzle, modifiers)
	// are preserved. Operand access is bounds-checked against the vector size.
	// (The escaped "\|=" operators of the original were not valid C++ and are replaced by a
	// single mask-and-or assignment.)
	static void RewriteRegisterD3D9(D3D9ShaderByteCode& byteCode, D3D9Register srcType, int srcIndex, D3D9Register dstType, int dstIndex)
	{
		const UInt32 kTypeAndIndexMask = 0x70001800 | 0x000007FF;
		const UInt32 replacementBits = EncodeRegisterType(dstType) | EncodeRegisterIndex(dstIndex);
		const UInt32 wantedIndex = static_cast<UInt32>(srcIndex);

		const size_t tokenCount = byteCode.size();
		size_t index = 1;
		while (index < tokenCount)
		{
			const UInt32 instruction = byteCode[index];
			if (IsRegularInstruction(DecodeOpcode(instruction)))
			{
				// clamp the operand count to the tokens that actually exist
				const size_t available = tokenCount - index - 1;
				size_t operandCount = DecodeInstructionLength(instruction);
				if (operandCount > available)
					operandCount = available;

				for (size_t i = 0; i < operandCount; ++i)
				{
					UInt32& token = byteCode[index + 1 + i];
					if (DecodeRegisterType(token) != srcType || DecodeRegisterIndex(token) != wantedIndex)
						continue;
					token = (token & ~kTypeAndIndexMask) | replacementBits;
				}
			}

			if (!NextToken(byteCode, index))
				break;
		}
	}


	static size_t FindEndOfShaderD3D9(const D3D9ShaderByteCode& byteCode)
	{
	size_t index = 1;
	do
	{
	const D3D9Opcode op = DecodeOpcode(byteCode[index]);
	if (op == kD3D9Op_END)
	return index;
	} while (NextToken(byteCode, index));
	return index;
	}


	// Patches a SM2.0 / SM3.0 vertex shader so that its position output goes through a temporary
	// register, and appends code that writes the adjusted position:
	//   mad oPos.xy, tmpPos.w, constFixup, tmpPos
	//   mov oPos.zw, tmpPos
	// constantWithFixupInfo is the index of the constant register used as "constFixup".
	//
	// Returns NULL on success, or a static error message on failure. All validation happens
	// before the first modification, so byteCode is left untouched whenever an error is returned.
	// (The escaped "\|" sequences of the original were not valid C++ and are restored to "|".)
	const char* PatchD3D9ShaderHalfPixelOffset(D3D9ShaderByteCode& byteCode, int constantWithFixupInfo)
	{
		// sanity and version checks
		if (byteCode.empty())
			return "Got empty vertex shader";

		UInt32 shaderType, shaderVersionMajor, shaderVersionMinor;
		DecodeShaderVersionD3D9(byteCode[0], &shaderType, &shaderVersionMajor, &shaderVersionMinor);
		if (shaderType != kD3D9ShaderTypeVertex)
			return "Got a non-vertex shader";

		const bool isSM30 = (shaderVersionMajor == 3 && shaderVersionMinor == 0);
		const bool isSM20 = (shaderVersionMajor == 2 && shaderVersionMinor == 0);
		if (!isSM30 && !isSM20)
			return "Only supports SM2.0 and SM3.0 shaders";

		// The register index field is 11 bits wide (see EncodeRegisterIndex). An index outside of
		// it would be silently truncated and the fixup would read some other constant.
		if (constantWithFixupInfo < 0 || constantWithFixupInfo > 0x7FF)
			return "Fixup constant register index is out of range";

		// All scanning helpers start at token 1, right after the version token.
		if (byteCode.size() < 2)
			return "Vertex shader has no instructions";

		// Locate the END token up front. If the instruction walk reaches END, the scans below follow
		// the same walk and stay inside the vector; otherwise the shader is rejected unmodified.
		const size_t insertPos = FindEndOfShaderD3D9(byteCode);
		if (insertPos >= byteCode.size() || DecodeOpcode(byteCode[insertPos]) != kD3D9Op_END)
			return "Could not find end of vertex shader";

		// overall process is:
		// 1) find unused temporary register
		// 2) replace all position writes to use the temporary register
		// 3) insert instruction at the end to output that temporary + fixup into position

		// Find vertex output register
		D3D9Register positionType = kD3D9Reg_RASTOUT;
		int positionIndex = 0;
		if (isSM30)
		{
			// in SM3.0, position is part of generic output registers, so find it from declarations
			positionType = kD3D9Reg_OUTPUT;
			positionIndex = FindPositionOutputRegisterD3D9(byteCode);
			if (positionIndex < 0)
			{
				return "Could not find SM3.0 vertex output register index";
			}
		}

		// Find temporary register to use
		const int tempIndex = FindUnusedTempRegisterD3D9(byteCode);
		if (isSM30 && tempIndex >= 32)
			return "Out of temporary registers in SM3.0";
		if (isSM20 && tempIndex >= 12)
			return "Out of temporary registers in SM2.0";

		// Rewrite positions usages into the temp (does not change the token count, insertPos stays valid)
		RewriteRegisterD3D9(byteCode, positionType, positionIndex, kD3D9Reg_TEMP, tempIndex);

		// Insert instructions to do fixup right before END
		const UInt32 kParameterTokenBit = 0x80000000; // bit 31 is set in every parameter token written here
		const UInt32 positionReg = EncodeRegisterIndex(positionIndex) | EncodeRegisterType(positionType) | kParameterTokenBit;
		const UInt32 tempReg = EncodeRegisterIndex(tempIndex) | EncodeRegisterType(kD3D9Reg_TEMP) | kParameterTokenBit;
		const UInt32 fixupReg = EncodeRegisterIndex(constantWithFixupInfo) | EncodeRegisterType(kD3D9Reg_CONST) | kParameterTokenBit;

		const size_t kFixupTokenCount = 8; // 5 tokens for mad, 3 tokens for mov
		const UInt32 fixupTokens[kFixupTokenCount] =
		{
			// mad oPos.xy, tmpPos.w, constFixup, tmpPos
			static_cast<UInt32>(kD3D9Op_MAD) | (4u << 24),
			positionReg | kD3D9WriteMaskX | kD3D9WriteMaskY, // oPos.xy
			tempReg | EncodeReplicateSwizzle(3), // tmpPos.w
			fixupReg | kD3D9NoSwizzle, // constFixup
			tempReg | kD3D9NoSwizzle, // tmpPos
			// mov oPos.zw, tmpPos
			static_cast<UInt32>(kD3D9Op_MOV) | (2u << 24),
			positionReg | kD3D9WriteMaskZ | kD3D9WriteMaskW, // oPos.zw
			tempReg | kD3D9NoSwizzle, // tmpPos
		};
		byteCode.insert(byteCode.begin() + insertPos, fixupTokens, fixupTokens + kFixupTokenCount);

		return NULL;
	}
	#pragma once

	// Utilities to modify D3D9 shader bytecode

	#include <vector>

	// D3D9 shader bytecode as a sequence of 32-bit tokens.
	// NOTE(review): UInt32 is not declared by this header; it has to be visible before this header
	// is included (presumably via UnityPrefix.h, which the .cpp includes first - confirm).
	typedef std::vector<UInt32> D3D9ShaderByteCode;

	// Modifies DX9 vertex shader bytecode to adjust clip space position, so that it matches
	// DX11/GL rasterization. constantWithFixupInfo is constant register index that
	// will contain screen size information.
	//
	// Returns NULL on success, or error message on failure.
	//
	// The basic idea is: we can make DX9 "half texel offset" thing be completely gone, by
	// shifting XY components of clip space position in all vertex shaders by half a viewport
	// pixel. This is actually what DX11 9.x feature level does behind the scenes (shader compiler
	// inserts the fixup, and runtime supplies the shader constant). It's also done by WebGL ANGLE
	// (see "The ANGLE Project: Implementing OpenGL ES 2.0 on Direct3D" article from OpenGL Insights
	// book).
	//
	// So we do the same here: insert fixup code in all DX9 vertex shaders. GfxDeviceD3D9 will supply
	// viewport info into the shader constant at runtime.
	//
	// Note, this assumes constantWithFixupInfo is not already used by the shader. We detect that from
	// reflection information above this call.
	const char* PatchD3D9ShaderHalfPixelOffset(D3D9ShaderByteCode& byteCode, int constantWithFixupInfo);
	#include "UnityPrefix.h"

	#if ENABLE_UNIT_TESTS

	#include "D3D9ByteCode.h"
	#include "Editor/Src/Utility/d3d11/D3D11Compiler.h"
	#include "Runtime/Testing/Testing.h"
	#include "../ShaderCompiler.h"

	extern D3D11Compiler g_D3D11Compiler;


	INTEGRATION_TEST_SUITE(D3D9ByteCodeTests)
	{
	// Test helper: assembles "source" into D3D9 bytecode, runs PatchD3D9ShaderHalfPixelOffset on
	// it with constant register 255, and returns the result.
	//
	// expectToFail == false: returns the disassembly of the patched shader with comments and
	//   indentation removed; returns "" (after logging an error) when assembling, patching or
	//   disassembling fails.
	// expectToFail == true: returns the error message produced by the patcher, or "" when the
	//   patch unexpectedly succeeded.
	static std::string PatchHalfPixelOffsetVertexShaderAsm(const char* source, bool expectToFail = false)
	{
		HRESULT hr;
		// assemble source shader string into DX9 bytecode
		D3D10Blob* bytecodeBlob;
		hr = g_D3D11Compiler.D3DAssemble(source, strlen(source), 0, &bytecodeBlob, NULL);
		if (FAILED_IMPL(hr))
		{
			ErrorString("Failed to assemble source shader");
			return "";
		}
		const size_t bytecodeSize = bytecodeBlob->GetBufferSize();
		if (bytecodeSize % 4 != 0)
		{
			// release the blob on this error path as well
			bytecodeBlob->Release();
			ErrorString("Assembled shader bytecode size should be multiple of 4");
			return "";
		}

		// get bytecode into a vector of tokens
		D3D9ShaderByteCode bytecode;
		bytecode.reserve(bytecodeSize / 4);
		const UInt8* ptr = (const UInt8*)bytecodeBlob->GetBufferPointer();
		for (size_t i = 0; i < bytecodeSize; i += 4)
		{
			// read one 32-bit token; the original line had lost its '*' characters and pushed
			// a pointer value converted to an integer instead of the token it points at
			bytecode.push_back(*(const UInt32*)(ptr + i));
		}
		bytecodeBlob->Release();

		// insert position fixup
		const char* msg = PatchD3D9ShaderHalfPixelOffset(bytecode, 255);
		if (expectToFail)
		{
			// NULL means success; constructing std::string from a NULL pointer is undefined
			return msg ? msg : "";
		}
		if (msg != NULL)
		{
			// do not disassemble (and silently accept) a shader that was not patched
			ErrorString(msg);
			return "";
		}

		// disassemble the shader
		D3D10Blob* disasm;
		hr = g_D3D11Compiler.D3DDisassemble(&bytecode[0], bytecode.size()*4, 0, NULL, &disasm);
		if (FAILED_IMPL(hr))
		{
			ErrorString("Failed to disassemble patched shader");
			return "";
		}
		std::string res = (const char*)disasm->GetBufferPointer();
		disasm->Release();

		// remove comments, indentation, trim
		RemoveCommentsFromAsm(res);
		res = DeindentAsmCode(res);
		res = Trim(res, " \r\n\t");

		return res;
	}

	// vs_2_0: the single write to oPos is redirected to r0 (no temporaries were in use),
	// and the mad/mov fixup pair is appended at the end.
	TEST(Trivial_VS20_Works)
	{
	const char* src =
	"vs_2_0\n"
	"def c0, 1, 0, 0, 0\n"
	"mov oPos, c0.x\n";
	std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
	std::string exp =
	"vs_2_0\n"
	"def c0, 1, 0, 0, 0\n"
	"mov r0, c0.x\n"
	"mad oPos.xy, r0.w, c255, r0\n"
	"mov oPos.zw, r0";
	CHECK_EQUAL(exp, res);
	}

	// vs_3_0: the position output is found through its dcl_position declaration (o0 here);
	// the declaration itself stays untouched while the write to o0 is redirected to r0.
	TEST(Trivial_VS30_Works)
	{
	const char* src =
	"vs_3_0\n"
	"def c0, 1, 0, 0, 0\n"
	"dcl_position o0\n"
	"mov o0, c0.x\n";
	std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
	std::string exp =
	"vs_3_0\n"
	"def c0, 1, 0, 0, 0\n"
	"dcl_position o0\n"
	"mov r0, c0.x\n"
	"mad o0.xy, r0.w, c255, r0\n"
	"mov o0.zw, r0";
	CHECK_EQUAL(exp, res);
	}

	// Runs the patcher over a source with comments in many places. The returned text is
	// discarded, so this only verifies that the helper runs through without logging an error
	// or crashing; the patched output itself is not compared against anything.
	TEST(Patching_Handles_Comments_Correctly)
	{
	const char* src =
	"// one comment there\n"
	"vs_3_0\n"
	"// another comment right here\n"
	"def c0, 1, 0, 0, 0\n"
	"// another comment here, trying to make it fairly long to see if we correctly decode the length field. Still making it long...\n"
	"dcl_position o0\n"
	"// what do you know, another comment\n"
	"mov o0, c0.x\n"
	"// perhaps unexpectedly, a comment at the end!\n";
	PatchHalfPixelOffsetVertexShaderAsm(src);
	}

	// vs_2_0 with r0 already in use: the patcher must pick r1 as the temporary, redirect all
	// four per-component oPos writes to it and leave the other outputs (oT0, oT1, oD0) alone.
	TEST(NonTrivial_VS20_Works)
	{
	const char* src =
	"vs_2_0\n"
	"dcl_position v0\n"
	"dcl_texcoord v1\n"
	"pow r0.x, v1.x, v1.y\n"
	"mul r0.xy, r0.x, v1\n"
	"add oT0.xy, r0.y, r0.x\n"
	"add oT1.xyz, -v0, c4\n"
	"mul oD0, v0, c4\n"
	"dp4 oPos.x, v0, c0\n"
	"dp4 oPos.y, v0, c1\n"
	"dp4 oPos.z, v0, c2\n"
	"dp4 oPos.w, v0, c3\n";
	std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
	std::string exp =
	"vs_2_0\n"
	"dcl_position v0\n"
	"dcl_texcoord v1\n"
	"pow r0.x, v1.x, v1.y\n"
	"mul r0.xy, r0.x, v1\n"
	"add oT0.xy, r0.y, r0.x\n"
	"add oT1.xyz, -v0, c4\n"
	"mul oD0, v0, c4\n"
	"dp4 r1.x, v0, c0\n"
	"dp4 r1.y, v0, c1\n"
	"dp4 r1.z, v0, c2\n"
	"dp4 r1.w, v0, c3\n"
	"mad oPos.xy, r1.w, c255, r1\n"
	"mov oPos.zw, r1";
	CHECK_EQUAL(exp, res);
	}

	// vs_3_0 with several declared outputs: position is o3 (not the first output) and r0 is
	// already in use, so writes to o3 must be redirected to r1 while o0..o2 stay unchanged.
	TEST(NonTrivial_VS30_Works)
	{
	const char* src =
	"vs_3_0\n"
	"dcl_position v0\n"
	"dcl_texcoord v1\n"
	"dcl_texcoord o0.xy\n"
	"dcl_texcoord1 o1.xyz\n"
	"dcl_color o2\n"
	"dcl_position o3\n"
	"pow r0.x, v1.x, v1.y\n"
	"mul r0.xy, r0.x, v1\n"
	"add o0.xy, r0.y, r0.x\n"
	"add o1.xyz, c4, -v0\n"
	"mul o2, c4, v0\n"
	"dp4 o3.x, v0, c0\n"
	"dp4 o3.y, v0, c1\n"
	"dp4 o3.z, v0, c2\n"
	"dp4 o3.w, v0, c3\n";
	std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
	std::string exp =
	"vs_3_0\n"
	"dcl_position v0\n"
	"dcl_texcoord v1\n"
	"dcl_texcoord o0.xy\n"
	"dcl_texcoord1 o1.xyz\n"
	"dcl_color o2\n"
	"dcl_position o3\n"
	"pow r0.x, v1.x, v1.y\n"
	"mul r0.xy, r0.x, v1\n"
	"add o0.xy, r0.y, r0.x\n"
	"add o1.xyz, c4, -v0\n"
	"mul o2, c4, v0\n"
	"dp4 r1.x, v0, c0\n"
	"dp4 r1.y, v0, c1\n"
	"dp4 r1.z, v0, c2\n"
	"dp4 r1.w, v0, c3\n"
	"mad o3.xy, r1.w, c255, r1\n"
	"mov o3.zw, r1";
	CHECK_EQUAL(exp, res);
	}

	// tests for error conditions

	// A pixel shader must be refused: the helper is called in "expect failure" mode and has to
	// hand back a non-empty error message.
	TEST(Error_PS20_Input_IsRejected)
	{
	const char* src =
	"ps_2_0\n"
	"mov oC0, c0\n";
	std::string msg = PatchHalfPixelOffsetVertexShaderAsm(src, true);
	CHECK_MSG(!msg.empty(), "Error message should be returned");
	}

	// Using r11 makes "max used + 1" equal to 12, which the patcher rejects for SM2.0;
	// a non-empty error message is expected.
	TEST(Error_VS20_With12TempRegisterUsed_IsRejected)
	{
	const char* src =
	"vs_2_0\n"
	"mov r11, c0\n" // we only scan for highest used register index, this will make us think all 12 temps in VS2.0 are used
	"mov oPos, r11\n";
	std::string msg = PatchHalfPixelOffsetVertexShaderAsm(src, true);
	CHECK_MSG(!msg.empty(), "Error message should be returned");
	}


	} // UNIT_TEST_SUITE(D3D9ByteCodeTests)

	#endif // ENABLE_UNIT_TESTS