Skip to content

Instantly share code, notes, and snippets.

@aras-p
Last active Apr 8, 2017
Embed
What would you like to do?
D3D9 Shader Bytecode Patching for Half-Pixel Fixup
#include "UnityPrefix.h"
#include "D3D9ByteCode.h"
// D3D9 shader bytecode format on MSDN: https://msdn.microsoft.com/en-us/library/windows/hardware/ff552891.aspx
// Upper 16 bits of the version token (token 0) for a vertex shader.
const UInt32 kD3D9ShaderTypeVertex = 0xFFFE0000;

// Swizzle field of a source parameter token: four 2-bit component selectors starting at this bit.
const UInt32 kD3D9SwizzleShift = 16;

// Identity swizzle (.xyzw): selector N picks component N.
const UInt32 kD3D9NoSwizzle =
    (0u << (kD3D9SwizzleShift + 0)) |
    (1u << (kD3D9SwizzleShift + 2)) |
    (2u << (kD3D9SwizzleShift + 4)) |
    (3u << (kD3D9SwizzleShift + 6));

// Write mask bits of a destination parameter token, one bit per component.
const UInt32 kD3D9WriteMaskX = 1u << 16;
const UInt32 kD3D9WriteMaskY = 1u << 17;
const UInt32 kD3D9WriteMaskZ = 1u << 18;
const UInt32 kD3D9WriteMaskW = 1u << 19;
// Instruction opcodes, as stored in the low 16 bits of an instruction token (see DecodeOpcode).
// Values are spelled out explicitly so that the numbering cannot drift if entries get reordered.
enum D3D9Opcode
{
    // arithmetic / flow control
    kD3D9Op_NOP          = 0,
    kD3D9Op_MOV          = 1,
    kD3D9Op_ADD          = 2,
    kD3D9Op_SUB          = 3,
    kD3D9Op_MAD          = 4,
    kD3D9Op_MUL          = 5,
    kD3D9Op_RCP          = 6,
    kD3D9Op_RSQ          = 7,
    kD3D9Op_DP3          = 8,
    kD3D9Op_DP4          = 9,
    kD3D9Op_MIN          = 10,
    kD3D9Op_MAX          = 11,
    kD3D9Op_SLT          = 12,
    kD3D9Op_SGE          = 13,
    kD3D9Op_EXP          = 14,
    kD3D9Op_LOG          = 15,
    kD3D9Op_LIT          = 16,
    kD3D9Op_DST          = 17,
    kD3D9Op_LRP          = 18,
    kD3D9Op_FRC          = 19,
    kD3D9Op_M4x4         = 20,
    kD3D9Op_M4x3         = 21,
    kD3D9Op_M3x4         = 22,
    kD3D9Op_M3x3         = 23,
    kD3D9Op_M3x2         = 24,
    kD3D9Op_CALL         = 25,
    kD3D9Op_CALLNZ       = 26,
    kD3D9Op_LOOP         = 27,
    kD3D9Op_RET          = 28,
    kD3D9Op_ENDLOOP      = 29,
    kD3D9Op_LABEL        = 30,
    kD3D9Op_DCL          = 31,
    kD3D9Op_POW          = 32,
    kD3D9Op_CRS          = 33,
    kD3D9Op_SGN          = 34,
    kD3D9Op_ABS          = 35,
    kD3D9Op_NRM          = 36,
    kD3D9Op_SINCOS       = 37,
    kD3D9Op_REP          = 38,
    kD3D9Op_ENDREP       = 39,
    kD3D9Op_IF           = 40,
    kD3D9Op_IFC          = 41,
    kD3D9Op_ELSE         = 42,
    kD3D9Op_ENDIF        = 43,
    kD3D9Op_BREAK        = 44,
    kD3D9Op_BREAKC       = 45,
    kD3D9Op_MOVA         = 46,
    kD3D9Op_DEFB         = 47,
    kD3D9Op_DEFI         = 48,

    // second range starts at 64
    kD3D9Op_TEXCOORD     = 64,
    kD3D9Op_TEXKILL      = 65,
    kD3D9Op_TEX          = 66,
    kD3D9Op_TEXBEM       = 67,
    kD3D9Op_TEXBEML      = 68,
    kD3D9Op_TEXREG2AR    = 69,
    kD3D9Op_TEXREG2GB    = 70,
    kD3D9Op_TEXM3x2PAD   = 71,
    kD3D9Op_TEXM3x2TEX   = 72,
    kD3D9Op_TEXM3x3PAD   = 73,
    kD3D9Op_TEXM3x3TEX   = 74,
    kD3D9Op_RESERVED0    = 75,
    kD3D9Op_TEXM3x3SPEC  = 76,
    kD3D9Op_TEXM3x3VSPEC = 77,
    kD3D9Op_EXPP         = 78,
    kD3D9Op_LOGP         = 79,
    kD3D9Op_CND          = 80,
    kD3D9Op_DEF          = 81,
    kD3D9Op_TEXREG2RGB   = 82,
    kD3D9Op_TEXDP3TEX    = 83,
    kD3D9Op_TEXM3x2DEPTH = 84,
    kD3D9Op_TEXDP3       = 85,
    kD3D9Op_TEXM3x3      = 86,
    kD3D9Op_TEXDEPTH     = 87,
    kD3D9Op_CMP          = 88,
    kD3D9Op_BEM          = 89,
    kD3D9Op_DP2ADD       = 90,
    kD3D9Op_DSX          = 91,
    kD3D9Op_DSY          = 92,
    kD3D9Op_TEXLDD       = 93,
    kD3D9Op_SETP         = 94,
    kD3D9Op_TEXLDL       = 95,
    kD3D9Op_BREAKP       = 96,

    // special tokens
    kD3D9Op_PHASE        = 0xFFFD,
    kD3D9Op_COMMENT      = 0xFFFE,
    kD3D9Op_END          = 0xFFFF,
};
// Register types, as packed into a parameter token (see DecodeRegisterType / EncodeRegisterType).
enum D3D9Register
{
    kD3D9Reg_NONE            = -1,
    kD3D9Reg_TEMP            = 0,
    kD3D9Reg_INPUT           = 1,
    kD3D9Reg_CONST           = 2,
    kD3D9Reg_ADDR_or_TEXTURE = 3,
    kD3D9Reg_RASTOUT         = 4,
    kD3D9Reg_ATTROUT         = 5,
    kD3D9Reg_OUTPUT          = 6,
    kD3D9Reg_CONST_INT       = 7,
    kD3D9Reg_COLOROUT        = 8,
    kD3D9Reg_DEPTHOUT        = 9,
    kD3D9Reg_SAMPLER         = 10,
    kD3D9Reg_CONST2          = 11,  // constants 2048..4095
    kD3D9Reg_CONST3          = 12,  // constants 4096..6143
    kD3D9Reg_CONST4          = 13,  // constants 6144..8191
    kD3D9Reg_CONST_BOOL      = 14,
    kD3D9Reg_LOOP            = 15,
    kD3D9Reg_TEMPFLOAT16     = 16,  // temp for half-precision floats
    kD3D9Reg_MISC            = 17,
    kD3D9Reg_LABEL           = 18,  // label pseudo-register
    kD3D9Reg_PREDICATE       = 19,
};
// Splits a version token into its parts:
//   outType  - upper 16 bits, left in place (directly comparable with kD3D9ShaderTypeVertex)
//   outMajor - bits 8..15
//   outMinor - bits 0..7
static void DecodeShaderVersionD3D9(UInt32 token, UInt32* outType, UInt32* outMajor, UInt32* outMinor)
{
    const UInt32 typeBits = token & 0xFFFF0000;
    const UInt32 versionBits = token & 0x0000FFFF;
    *outType = typeBits;
    *outMajor = versionBits >> 8;
    *outMinor = versionBits & 0xFF;
}
// Extracts the opcode from the low 16 bits of an instruction token.
static D3D9Opcode DecodeOpcode(UInt32 token)
{
    const UInt32 opcodeBits = token & 0xFFFFu;
    return static_cast<D3D9Opcode>(opcodeBits);
}
// Extracts the instruction length field (bits 24..27) from an instruction token.
// NextToken uses it as the number of tokens following the instruction token.
static UInt32 DecodeInstructionLength(UInt32 token)
{
    return (token >> 24) & 0xF;
}
// Extracts the length field of a comment token (bits 16..30).
// NextToken uses it as the number of tokens following the comment token.
static UInt32 DecodeCommentLength(UInt32 token)
{
    return (token >> 16) & 0x7FFF;
}
// Extracts the register index (low 11 bits) from a parameter token.
static UInt32 DecodeRegisterIndex(UInt32 token)
{
    const UInt32 kIndexMask = 0x7FF;
    return token & kIndexMask;
}
// Produces the register index bits (low 11 bits) of a parameter token.
// Bits of index above the 11-bit field are discarded.
static UInt32 EncodeRegisterIndex(int index)
{
    const UInt32 kIndexMask = 0x7FF;
    return static_cast<UInt32>(index) & kIndexMask;
}
// Extracts the register type from a parameter token. The 5-bit value is split in two:
// its low three bits live in token bits 28..30, its high two bits in token bits 11..12.
static D3D9Register DecodeRegisterType(UInt32 token)
{
    const UInt32 lowBits = (token >> 28) & 0x7;
    const UInt32 highBits = (token >> 8) & 0x18;
    return static_cast<D3D9Register>(lowBits | highBits);
}
// Produces the register type bits of a parameter token (inverse of DecodeRegisterType):
// low three bits of the type go to token bits 28..30, the next two bits to token bits 11..12.
static UInt32 EncodeRegisterType(D3D9Register type)
{
    const UInt32 typeBits = static_cast<UInt32>(type);
    const UInt32 lowBits = typeBits & 0x7;
    const UInt32 highBits = typeBits & 0x18;
    return (lowBits << 28) | (highBits << 8);
}
// Produces swizzle bits that place the same component selector into all four
// 2-bit swizzle slots (e.g. comp = 3 yields ".wwww").
static UInt32 EncodeReplicateSwizzle(UInt32 comp)
{
    UInt32 swizzle = 0;
    for (UInt32 slot = 0; slot < 4; ++slot)
        swizzle |= comp << (kD3D9SwizzleShift + 2 * slot);
    return swizzle;
}
// Advances inOutIndex from the instruction (or comment) token it currently points at
// to the token that follows that instruction and all of its trailing tokens.
//
// Returns true only when inOutIndex then refers to a valid token that callers may read.
// Returns false when:
//   - inOutIndex was already outside the bytecode (it is left unchanged),
//   - the token just stepped over was the END token, or
//   - the advance ran off the end of the bytecode (truncated stream); in that case
//     inOutIndex is clamped to byteCode.size() so it never points past one-past-the-end.
static bool NextToken(const D3D9ShaderByteCode& byteCode, size_t& inOutIndex)
{
    const size_t tokenCount = byteCode.size();
    if (inOutIndex >= tokenCount)
        return false;

    const UInt32 token = byteCode[inOutIndex];
    const D3D9Opcode op = DecodeOpcode(token);
    // comment instructions have different length encoding
    const UInt32 length = (op == kD3D9Op_COMMENT) ? DecodeCommentLength(token) : DecodeInstructionLength(token);
    inOutIndex += length + 1;

    if (op == kD3D9Op_END)
        return false;

    // Callers read byteCode[inOutIndex] right after a successful call, so never
    // report success for a position that is outside the bytecode.
    if (inOutIndex >= tokenCount)
    {
        inOutIndex = tokenCount;
        return false;
    }
    return true;
}
// "regular" instructions have destination + source registers right after instruction token
static bool IsRegularInstruction(D3D9Opcode op)
{
    switch (op)
    {
        // These tokens are not followed by plain destination + source register tokens.
        case kD3D9Op_END:
        case kD3D9Op_COMMENT:
        case kD3D9Op_DCL:
        case kD3D9Op_DEF:
        case kD3D9Op_DEFI:
        case kD3D9Op_DEFB:
            return false;
        default:
            return true;
    }
}
static int FindUnusedTempRegisterD3D9(const D3D9ShaderByteCode& byteCode)
{
size_t index = 1;
// Find max used temporary register slot.
// HLSL compiler does fairly tight temporary register allocation,
// so we'll go with "max used + 1" as the "free register".
int maxUsed = -1;
do
{
const D3D9Opcode op = DecodeOpcode(byteCode[index]);
if (IsRegularInstruction(op))
{
const UInt32 length = DecodeInstructionLength(byteCode[index]);
for (UInt32 i = 0; i < length; ++i)
{
UInt32 token = byteCode[index + 1 + i];
D3D9Register type = DecodeRegisterType(token);
if (type == kD3D9Reg_TEMP)
{
int regIndex = DecodeRegisterIndex(token);
if (regIndex > maxUsed)
maxUsed = regIndex;
}
}
}
} while (NextToken(byteCode, index));
return maxUsed + 1;
}
// Finds output position register index in VS3.0
static int FindPositionOutputRegisterD3D9(const D3D9ShaderByteCode& byteCode)
{
size_t index = 1;
do
{
const D3D9Opcode op = DecodeOpcode(byteCode[index]);
if (op == kD3D9Op_DCL)
{
const UInt32 length = DecodeInstructionLength(byteCode[index]);
if (length >= 2)
{
UInt32 token1 = byteCode[index + 1];
UInt32 token2 = byteCode[index + 2];
if (token1 == 0x80000000 && DecodeRegisterType(token2) == kD3D9Reg_OUTPUT)
{
return DecodeRegisterIndex(token2);
}
}
}
} while (NextToken(byteCode, index));
return -1;
}
// Rewrites all usages of "src" register into "dst" one.
//
// Every parameter token of every regular instruction (see IsRegularInstruction)
// that refers to register (srcType, srcIndex) gets its register type and index
// replaced with (dstType, dstIndex); all other bits of the token are preserved.
//
// All accesses are bounds-checked, so truncated bytecode is processed as far as
// it goes instead of being read or written out of bounds.
static void RewriteRegisterD3D9(D3D9ShaderByteCode& byteCode, D3D9Register srcType, int srcIndex, D3D9Register dstType, int dstIndex)
{
    const UInt32 kRegisterTypeMask = 0x70001800;
    const UInt32 kRegisterIndexMask = 0x000007FF;
    const UInt32 wantedIndex = static_cast<UInt32>(srcIndex);
    const UInt32 replacementBits = EncodeRegisterType(dstType) | EncodeRegisterIndex(dstIndex);

    const size_t tokenCount = byteCode.size();
    size_t index = 1; // token 0 is the version token
    while (index < tokenCount)
    {
        const UInt32 instruction = byteCode[index];
        if (IsRegularInstruction(DecodeOpcode(instruction)))
        {
            // Parameter tokens follow the instruction token; never touch anything past the end.
            const size_t paramsBegin = index + 1;
            size_t paramsEnd = paramsBegin + DecodeInstructionLength(instruction);
            if (paramsEnd > tokenCount)
                paramsEnd = tokenCount;

            for (size_t i = paramsBegin; i < paramsEnd; ++i)
            {
                UInt32& param = byteCode[i];
                if (DecodeRegisterType(param) != srcType || DecodeRegisterIndex(param) != wantedIndex)
                    continue;
                // clear register type + index, then put the destination register in
                param = (param & ~(kRegisterTypeMask | kRegisterIndexMask)) | replacementBits;
            }
        }
        if (!NextToken(byteCode, index))
            break;
    }
}
static size_t FindEndOfShaderD3D9(const D3D9ShaderByteCode& byteCode)
{
size_t index = 1;
do
{
const D3D9Opcode op = DecodeOpcode(byteCode[index]);
if (op == kD3D9Op_END)
return index;
} while (NextToken(byteCode, index));
return index;
}
const char* PatchD3D9ShaderHalfPixelOffset(D3D9ShaderByteCode& byteCode, int constantWithFixupInfo)
{
// sanity and version checks
if (byteCode.empty())
return "Got empty vertex shader";
UInt32 shaderType, shaderVersionMajor, shaderVersionMinor;
DecodeShaderVersionD3D9(byteCode[0], &shaderType, &shaderVersionMajor, &shaderVersionMinor);
if (shaderType != kD3D9ShaderTypeVertex)
return "Got a non-vertex shader";
const bool isSM30 = (shaderVersionMajor == 3 && shaderVersionMinor == 0);
const bool isSM20 = (shaderVersionMajor == 2 && shaderVersionMinor == 0);
if (!isSM30 && !isSM20)
return "Only supports SM2.0 and SM3.0 shaders";
// overall process is:
// 1) find unused temporary register
// 2) replace all position writes to use the temporary register
// 3) insert instruction at the end to output that temporary + fixup into position
// Find vertex output register
D3D9Register positionType = kD3D9Reg_RASTOUT;
int positionIndex = 0;
if (isSM30)
{
// in SM3.0, position is part of generic output registers, so find it from declarations
positionType = kD3D9Reg_OUTPUT;
positionIndex = FindPositionOutputRegisterD3D9(byteCode);
if (positionIndex < 0)
{
return "Could not find SM3.0 vertex output register index";
}
}
// Find temporary register to use
int tempIndex = FindUnusedTempRegisterD3D9(byteCode);
if (isSM30 && tempIndex >= 32)
return "Out of temporary registers in SM3.0";
if (isSM20 && tempIndex >= 12)
return "Out of temporary registers in SM2.0";
// Rewrite positions usages into the temp
RewriteRegisterD3D9(byteCode, positionType, positionIndex, kD3D9Reg_TEMP, tempIndex);
// Insert instruction to do fixup at the end
size_t insertPos = FindEndOfShaderD3D9(byteCode);
byteCode.insert(byteCode.begin() + insertPos, 8, 0); // 5 tokens for mad, 3 tokens for mov
// mad oPos.xy, tmpPos.w, constFixup, tmpPos
byteCode[insertPos + 0] = kD3D9Op_MAD + (4 << 24);
byteCode[insertPos + 1] = EncodeRegisterIndex(positionIndex) | EncodeRegisterType(positionType) | kD3D9WriteMaskX | kD3D9WriteMaskY | 0x80000000; // oPos.xy
byteCode[insertPos + 2] = EncodeRegisterIndex(tempIndex) | EncodeRegisterType(kD3D9Reg_TEMP) | EncodeReplicateSwizzle(3) | 0x80000000; // tmpPos.w
byteCode[insertPos + 3] = EncodeRegisterIndex(constantWithFixupInfo) | EncodeRegisterType(kD3D9Reg_CONST) | kD3D9NoSwizzle | 0x80000000; // constFixup
byteCode[insertPos + 4] = EncodeRegisterIndex(tempIndex) | EncodeRegisterType(kD3D9Reg_TEMP) | kD3D9NoSwizzle | 0x80000000; // tmpPos
// mov oPos.zw, tmpPos
byteCode[insertPos + 5] = kD3D9Op_MOV + (2 << 24);
byteCode[insertPos + 6] = EncodeRegisterIndex(positionIndex) | EncodeRegisterType(positionType) | kD3D9WriteMaskZ | kD3D9WriteMaskW | 0x80000000; // oPos.zw;
byteCode[insertPos + 7] = EncodeRegisterIndex(tempIndex) | EncodeRegisterType(kD3D9Reg_TEMP) | kD3D9NoSwizzle | 0x80000000; // tmpPos
return NULL;
}
#pragma once
// Utilities to modify D3D9 shader bytecode
#include <vector>
// D3D9 shader bytecode as a sequence of tokens; element 0 is the shader version token.
// NOTE(review): UInt32 is not declared by anything included here - presumably the
// including file is expected to have pulled it in already; confirm.
typedef std::vector<UInt32> D3D9ShaderByteCode;
// Modifies DX9 vertex shader bytecode to adjust clip space position, so that it matches
// DX11/GL rasterization.
//
// byteCode is patched in place. constantWithFixupInfo is the constant register index that
// will contain screen size information.
//
// Returns NULL on success, or error message on failure.
//
// The basic idea is: we can make the DX9 "half texel offset" go away completely, by
// shifting XY components of clip space position in all vertex shaders by half a viewport
// pixel. This is actually what DX11 9.x feature level does behind the scenes (shader compiler
// inserts the fixup, and runtime supplies the shader constant). It's also done by WebGL ANGLE
// (see "The ANGLE Project: Implementing OpenGL ES 2.0 on Direct3D" article from OpenGL Insights
// book).
//
// So we do the same here: insert fixup code in all DX9 vertex shaders. GfxDeviceD3D9 will supply
// viewport info into the shader constant at runtime.
//
// Note, this assumes constantWithFixupInfo is not already used by the shader. That is detected
// from reflection information before this function gets called.
const char* PatchD3D9ShaderHalfPixelOffset(D3D9ShaderByteCode& byteCode, int constantWithFixupInfo);
#include "UnityPrefix.h"
#if ENABLE_UNIT_TESTS
#include "D3D9ByteCode.h"
#include "Editor/Src/Utility/d3d11/D3D11Compiler.h"
#include "Runtime/Testing/Testing.h"
#include "../ShaderCompiler.h"
extern D3D11Compiler g_D3D11Compiler;
INTEGRATION_TEST_SUITE(D3D9ByteCodeTests)
{
    // Assembles the given vertex shader assembly, runs PatchD3D9ShaderHalfPixelOffset on it
    // (with constant register 255 as the fixup constant) and returns:
    //  - when expectToFail is false: disassembly of the patched shader with comments removed,
    //    de-indented and trimmed; an empty string (plus a logged error) if any step failed,
    //    including the patching itself;
    //  - when expectToFail is true: the error message returned by the patcher, or an empty
    //    string if patching unexpectedly succeeded.
    static std::string PatchHalfPixelOffsetVertexShaderAsm(const char* source, bool expectToFail = false)
    {
        HRESULT hr;

        // assemble source shader string into DX9 bytecode
        D3D10Blob* bytecodeBlob = NULL;
        hr = g_D3D11Compiler.D3DAssemble(source, strlen(source), 0, &bytecodeBlob, NULL);
        if (FAILED_IMPL(hr))
        {
            ErrorString("Failed to assemble source shader");
            return "";
        }
        const size_t bytecodeSize = bytecodeBlob->GetBufferSize();
        if (bytecodeSize % 4 != 0)
        {
            // do not leak the blob on this early-out
            bytecodeBlob->Release();
            ErrorString("Assembled shader bytecode size should be multiple of 4");
            return "";
        }

        // get bytecode into a vector of tokens
        D3D9ShaderByteCode bytecode;
        bytecode.reserve(bytecodeSize / 4);
        const UInt8* ptr = (const UInt8*)bytecodeBlob->GetBufferPointer();
        for (size_t i = 0; i < bytecodeSize; i += 4)
        {
            bytecode.push_back(*(const UInt32*)(ptr + i));
        }
        bytecodeBlob->Release();

        // insert position fixup
        const char* msg = PatchD3D9ShaderHalfPixelOffset(bytecode, 255);
        if (expectToFail)
        {
            // msg is NULL when patching succeeded; std::string must not be built from NULL
            return msg != NULL ? std::string(msg) : std::string();
        }
        if (msg != NULL)
        {
            // patching was expected to work; do not pretend the unpatched shader is the result
            ErrorString(msg);
            return "";
        }

        // disassemble the shader
        D3D10Blob* disasm = NULL;
        hr = g_D3D11Compiler.D3DDisassemble(&bytecode[0], bytecode.size() * 4, 0, NULL, &disasm);
        if (FAILED_IMPL(hr))
        {
            ErrorString("Failed to disassemble patched shader");
            return "";
        }
        std::string res = (const char*)disasm->GetBufferPointer();
        disasm->Release();

        // remove comments, indentation, trim
        RemoveCommentsFromAsm(res);
        res = DeindentAsmCode(res);
        res = Trim(res, " \r\n\t");
        return res;
    }

    TEST(Trivial_VS20_Works)
    {
        const char* src =
            "vs_2_0\n"
            "def c0, 1, 0, 0, 0\n"
            "mov oPos, c0.x\n";
        std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
        std::string exp =
            "vs_2_0\n"
            "def c0, 1, 0, 0, 0\n"
            "mov r0, c0.x\n"
            "mad oPos.xy, r0.w, c255, r0\n"
            "mov oPos.zw, r0";
        CHECK_EQUAL(exp, res);
    }

    TEST(Trivial_VS30_Works)
    {
        const char* src =
            "vs_3_0\n"
            "def c0, 1, 0, 0, 0\n"
            "dcl_position o0\n"
            "mov o0, c0.x\n";
        std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
        std::string exp =
            "vs_3_0\n"
            "def c0, 1, 0, 0, 0\n"
            "dcl_position o0\n"
            "mov r0, c0.x\n"
            "mad o0.xy, r0.w, c255, r0\n"
            "mov o0.zw, r0";
        CHECK_EQUAL(exp, res);
    }

    TEST(Patching_Handles_Comments_Correctly)
    {
        const char* src =
            "// one comment there\n"
            "vs_3_0\n"
            "// another comment right here\n"
            "def c0, 1, 0, 0, 0\n"
            "// another comment here, trying to make it fairly long to see if we correctly decode the length field. Still making it long...\n"
            "dcl_position o0\n"
            "// what do you know, another comment\n"
            "mov o0, c0.x\n"
            "// perhaps unexpectedly, a comment at the end!\n";
        std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
        // the helper returns an empty string when assembling, patching or disassembling failed
        CHECK_MSG(!res.empty(), "Shader with comments should be patched and disassembled successfully");
    }

    TEST(NonTrivial_VS20_Works)
    {
        const char* src =
            "vs_2_0\n"
            "dcl_position v0\n"
            "dcl_texcoord v1\n"
            "pow r0.x, v1.x, v1.y\n"
            "mul r0.xy, r0.x, v1\n"
            "add oT0.xy, r0.y, r0.x\n"
            "add oT1.xyz, -v0, c4\n"
            "mul oD0, v0, c4\n"
            "dp4 oPos.x, v0, c0\n"
            "dp4 oPos.y, v0, c1\n"
            "dp4 oPos.z, v0, c2\n"
            "dp4 oPos.w, v0, c3\n";
        std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
        std::string exp =
            "vs_2_0\n"
            "dcl_position v0\n"
            "dcl_texcoord v1\n"
            "pow r0.x, v1.x, v1.y\n"
            "mul r0.xy, r0.x, v1\n"
            "add oT0.xy, r0.y, r0.x\n"
            "add oT1.xyz, -v0, c4\n"
            "mul oD0, v0, c4\n"
            "dp4 r1.x, v0, c0\n"
            "dp4 r1.y, v0, c1\n"
            "dp4 r1.z, v0, c2\n"
            "dp4 r1.w, v0, c3\n"
            "mad oPos.xy, r1.w, c255, r1\n"
            "mov oPos.zw, r1";
        CHECK_EQUAL(exp, res);
    }

    TEST(NonTrivial_VS30_Works)
    {
        const char* src =
            "vs_3_0\n"
            "dcl_position v0\n"
            "dcl_texcoord v1\n"
            "dcl_texcoord o0.xy\n"
            "dcl_texcoord1 o1.xyz\n"
            "dcl_color o2\n"
            "dcl_position o3\n"
            "pow r0.x, v1.x, v1.y\n"
            "mul r0.xy, r0.x, v1\n"
            "add o0.xy, r0.y, r0.x\n"
            "add o1.xyz, c4, -v0\n"
            "mul o2, c4, v0\n"
            "dp4 o3.x, v0, c0\n"
            "dp4 o3.y, v0, c1\n"
            "dp4 o3.z, v0, c2\n"
            "dp4 o3.w, v0, c3\n";
        std::string res = PatchHalfPixelOffsetVertexShaderAsm(src);
        std::string exp =
            "vs_3_0\n"
            "dcl_position v0\n"
            "dcl_texcoord v1\n"
            "dcl_texcoord o0.xy\n"
            "dcl_texcoord1 o1.xyz\n"
            "dcl_color o2\n"
            "dcl_position o3\n"
            "pow r0.x, v1.x, v1.y\n"
            "mul r0.xy, r0.x, v1\n"
            "add o0.xy, r0.y, r0.x\n"
            "add o1.xyz, c4, -v0\n"
            "mul o2, c4, v0\n"
            "dp4 r1.x, v0, c0\n"
            "dp4 r1.y, v0, c1\n"
            "dp4 r1.z, v0, c2\n"
            "dp4 r1.w, v0, c3\n"
            "mad o3.xy, r1.w, c255, r1\n"
            "mov o3.zw, r1";
        CHECK_EQUAL(exp, res);
    }

    // tests for error conditions
    TEST(Error_PS20_Input_IsRejected)
    {
        const char* src =
            "ps_2_0\n"
            "mov oC0, c0\n";
        std::string msg = PatchHalfPixelOffsetVertexShaderAsm(src, true);
        CHECK_MSG(!msg.empty(), "Error message should be returned");
    }

    TEST(Error_VS20_With12TempRegisterUsed_IsRejected)
    {
        const char* src =
            "vs_2_0\n"
            "mov r11, c0\n" // we only scan for highest used register index, this will make us think all 12 temps in VS2.0 are used
            "mov oPos, r11\n";
        std::string msg = PatchHalfPixelOffsetVertexShaderAsm(src, true);
        CHECK_MSG(!msg.empty(), "Error message should be returned");
    }
} // INTEGRATION_TEST_SUITE(D3D9ByteCodeTests)
#endif // ENABLE_UNIT_TESTS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment