jdryg/swr.c

## swr.c
#include "swr.h"
#include "swr_math.h"
#include <stdbool.h>
#include <malloc.h>
#include <memory.h>
#include <string.h>
#include <assert.h>
#include <immintrin.h>

// Set to 1 to make the rasterizers that check this switch skip per-pixel color
// interpolation and write 0xFFFFFFFF for every covered pixel instead.
#define SWR_CONFIG_NO_PIXEL_SHADER 0

// Forward declarations of the file-local implementations. They are needed
// because the swr_api table and swrDrawTriangleDispatch() refer to these
// functions before their definitions appear.
static swr_context* swrCreateContext(uint32_t w, uint32_t h);
static void swrDestroyContext(swr_context* ctx);
static void swrClear(swr_context* ctx, uint32_t color);
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color);
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color);
// Triangle rasterizers. All variants share one signature: three integer
// vertices followed by one packed color per vertex.
static void swrDrawTriangleDispatch(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleRef(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleRef_Hierarchical(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleRef_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleRef_HierarchicalLRB_NoCond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_Ref(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color);

// Public API table. `swr` points at a file-scope compound literal, so the
// table is writable at runtime: .drawTriangle initially refers to
// swrDrawTriangleDispatch(), which overwrites that slot with a concrete
// rasterizer when it is called.
swr_api* swr = &(swr_api){
	.createContext = swrCreateContext,
	.destroyContext = swrDestroyContext,
	.clear = swrClear,
	.drawPixel = swrDrawPixel,
	.drawLine = swrDrawLine,
	.drawTriangle = swrDrawTriangleDispatch,
	.drawText = swrDrawText
};

// Creates a rendering context with a zero-initialized framebuffer of
// w * h 32-bit pixels.
//
// Returns NULL when the pixel count does not fit in size_t or when either
// allocation fails. A non-NULL result must be released with
// swrDestroyContext().
static swr_context* swrCreateContext(uint32_t w, uint32_t h)
{
	// Reject dimensions whose pixel count wraps around in size_t. The
	// remaining "count * element size" multiplication is checked by calloc().
	const size_t numPixels = (size_t)w * (size_t)h;
	if (w != 0 && numPixels / (size_t)w != (size_t)h) {
		return NULL;
	}

	// calloc() zero-fills, so every context field starts out as 0/NULL.
	swr_context* ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		return NULL;
	}

	ctx->m_FrameBuffer = calloc(numPixels, sizeof(uint32_t));
	if (!ctx->m_FrameBuffer) {
		// m_FrameBuffer is NULL here, so the destroy call only frees ctx.
		swrDestroyContext(ctx);
		return NULL;
	}

	ctx->m_Width = w;
	ctx->m_Height = h;

	return ctx;
}

// Releases the framebuffer and then the context itself.
// Passing NULL is a no-op, mirroring free().
static void swrDestroyContext(swr_context* ctx)
{
	if (!ctx) {
		return;
	}

	free(ctx->m_FrameBuffer);
	free(ctx);
}

// Overwrites every pixel of the framebuffer with `color`.
static void swrClear(swr_context* ctx, uint32_t color)
{
	// Count pixels in size_t so the width * height product cannot wrap at
	// 32 bits and leave part of a very large framebuffer untouched.
	const size_t numPixels = (size_t)ctx->m_Width * (size_t)ctx->m_Height;
	uint32_t* buffer = ctx->m_FrameBuffer;
	for (size_t i = 0; i < numPixels; ++i) {
		buffer[i] = color;
	}
}

// Stores `color` at pixel (x, y). Coordinates that fall outside the
// framebuffer are silently ignored, so callers do not need to clip.
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color)
{
	const bool insideX = x >= 0 && x < (int32_t)ctx->m_Width;
	const bool insideY = y >= 0 && y < (int32_t)ctx->m_Height;
	if (insideX && insideY) {
		// Row-major layout: one row is m_Width pixels long.
		ctx->m_FrameBuffer[x + y * ctx->m_Width] = color;
	}
}

// Draws a line between (x0, y0) and (x1, y1), both endpoints included, using
// an integer error accumulator. Pixels are written through swrDrawPixel(), so
// parts of the line outside the framebuffer are skipped.
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color)
{
	// Walk along the axis with the larger extent ("major") so that every step
	// produces exactly one pixel. For steep lines the major axis is y.
	const bool steep = swr_absi(x0 - x1) < swr_absi(y0 - y1);

	int32_t majorBegin = steep ? y0 : x0;
	int32_t minorBegin = steep ? x0 : y0;
	int32_t majorEnd = steep ? y1 : x1;
	int32_t minorEnd = steep ? x1 : y1;

	// Always iterate in the +major direction.
	if (majorBegin > majorEnd) {
		const int32_t tmpMajor = majorBegin;
		const int32_t tmpMinor = minorBegin;
		majorBegin = majorEnd;
		minorBegin = minorEnd;
		majorEnd = tmpMajor;
		minorEnd = tmpMinor;
	}

	const int32_t majorDelta = majorEnd - majorBegin;
	const int32_t errorStep = swr_absi(minorEnd - minorBegin) * 2;
	const int32_t minorStep = minorEnd > minorBegin ? 1 : -1;

	int32_t error = 0;
	int32_t minor = minorBegin;
	for (int32_t major = majorBegin; major <= majorEnd; ++major) {
		if (steep) {
			// Major axis is y, minor axis is x.
			swrDrawPixel(ctx, minor, major, color);
		} else {
			swrDrawPixel(ctx, major, minor, color);
		}

		// Accumulate twice the minor-axis slope; once it exceeds the major
		// extent, advance one pixel along the minor axis.
		error += errorStep;
		if (error > majorDelta) {
			minor += minorStep;
			error -= majorDelta * 2;
		}
	}
}

// Draws the characters in [str, end) with a bitmap font, starting with the
// top-left corner of the first glyph at (x0, y0).
//
// If `end` is NULL the text runs up to the terminating '\0' of `str`.
// Glyphs are placed left to right, m_CharWidth pixels apart, all on the same
// row: there is no line-break handling. Characters outside
// [m_CharMin, m_CharMax] are replaced with m_MissingCharFallbackID.
// Only set bits produce pixels; unset bits leave the framebuffer untouched.
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color)
{
	if (end == NULL) {
		end = str + strlen(str);
	}

	const int32_t glyphW = (int32_t)font->m_CharWidth;
	const int32_t glyphH = (int32_t)font->m_CharHeight;
	const uint8_t* glyphTable = font->m_CharData;

	int32_t penX = x0;
	for (const char* cur = str; cur != end; ++cur, penX += glyphW) {
		char ch = *cur;
		if (ch < font->m_CharMin || ch > font->m_CharMax) {
			ch = font->m_MissingCharFallbackID;
		}

		// Each glyph occupies glyphH consecutive bytes, one byte per row.
		const uint8_t glyphIndex = (uint8_t)ch - font->m_CharMin;
		const uint8_t* glyphRows = &glyphTable[glyphIndex * glyphH];
		for (int32_t row = 0; row < glyphH; ++row) {
			const uint8_t rowBits = glyphRows[row];
			for (int32_t col = 0; col < glyphW; ++col) {
				// Bit `col` of the row byte corresponds to column `col`.
				if ((rowBits & (1u << col)) == 0) {
					continue;
				}
				swrDrawPixel(ctx, penX + col, y0 + row, color);
			}
		}
	}
}

// Initial value of swr->drawTriangle.
//
// On its first call it picks a concrete rasterizer, stores it in
// swr->drawTriangle so later calls go straight to it, and then forwards the
// current triangle to the selected implementation.
//
// With GCC/Clang on x86 the choice is based on the instruction sets the
// running CPU reports, preferring SSE4.1, then SSSE3, then SSE2, and finally
// the scalar reference rasterizer. With other toolchains no feature query is
// performed and the SSE4.1 variant is installed unconditionally.
static void swrDrawTriangleDispatch(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
#if defined(__GNUC__) && !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
	// Harmless when the CPU model has already been initialized.
	__builtin_cpu_init();

	if (__builtin_cpu_supports("sse4.1")) {
		swr->drawTriangle = swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2;
	} else if (__builtin_cpu_supports("ssse3")) {
		swr->drawTriangle = swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2;
	} else if (__builtin_cpu_supports("sse2")) {
		swr->drawTriangle = swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2;
	} else {
		swr->drawTriangle = swrDrawTriangleRef_HierarchicalLRB_NoCond;
	}
#else
	// TODO: Check CPU caps on this toolchain as well.
	swr->drawTriangle = swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2;
#endif

	swr->drawTriangle(ctx, x0, y0, x1, y1, x2, y2, color0, color1, color2);
}

//////////////////////////////////////////////////////////////////////////
// swrDrawTriangle() implementations
//
// Edge function of the directed line from (x0, y0) to (x1, y1), stored as an
// origin plus per-axis increments:
//   m_dx = y1 - y0 : change of the function for a +1 step in x
//   m_dy = x0 - x1 : change of the function for a +1 step in y
// The function evaluates to 0 at both end points of the line.
typedef struct swr_edge
{
	int32_t m_x0;
	int32_t m_y0;
	int32_t m_dx;
	int32_t m_dy;
} swr_edge;

// Builds the edge function for the line (x0, y0) -> (x1, y1).
static inline swr_edge swr_edgeInit(int32_t x0, int32_t y0, int32_t x1, int32_t y1)
{
	swr_edge edge;
	edge.m_x0 = x0;
	edge.m_y0 = y0;
	edge.m_dx = y1 - y0;
	edge.m_dy = x0 - x1;
	return edge;
}

// Evaluates the edge function at (x, y).
static inline int32_t swr_edgeEval(swr_edge edge, int32_t x, int32_t y)
{
	const int32_t relX = x - edge.m_x0;
	const int32_t relY = y - edge.m_y0;
	return relX * edge.m_dx + relY * edge.m_dy;
}

// Narrows the inclusive pixel range [*pxmin, *pxmax] of one scanline to the
// pixels for which a single edge function is non-negative.
//
//   w  : value of the edge function at pixel 0 of the row
//   dx : increment of the edge function per pixel
//
// The value at pixel i is w + i * dx, therefore:
//   w >= 0 && dx >= 0 : every pixel passes, the range is left unchanged
//   w >= 0 && dx <  0 : pixels [0, floor(w / -dx)] pass
//   w <  0 && dx >  0 : pixels [ceil(-w / dx), ...) pass
//   w <  0 && dx <= 0 : no pixel passes, the range is made empty
static inline void swrRef_clipRowToEdge(int32_t w, int32_t dx, int32_t* pxmin, int32_t* pxmax)
{
	if (w >= 0) {
		if (dx < 0) {
			*pxmax = swr_mini(*pxmax, -(w / dx));
		}
	} else if (dx > 0) {
		*pxmin = swr_maxi(*pxmin, (-w / dx) + ((-w % dx) != 0 ? 1 : 0));
	} else {
		// The row starts outside this edge and never crosses it. This happens
		// when the bounding box was clamped at the left framebuffer border and
		// the triangle does not reach this row on screen.
		*pxmax = -1;
	}
}

// Reference implementation
// https://fgiesen.wordpress.com/2013/02/08/triangle-rasterization-in-practice/
// NOTE: No fill rule used. All pixels lying ON an edge are drawn.
//
// Rasterizes the triangle (x0, y0), (x1, y1), (x2, y2) into the framebuffer,
// clipped to the framebuffer rectangle. colorN is the color at vertex N; the
// color channels are interpolated across the triangle with barycentric
// weights. Either winding order is accepted. Zero-area triangles and triangles
// completely outside the framebuffer draw nothing.
//
// For every scanline the covered x range is solved directly from the three
// edge functions, so the inner loop contains no coverage test.
static void swrDrawTriangleRef(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box, clamped to the framebuffer
	const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	if (minX > maxX || minY > maxY) {
		// The triangle lies completely outside the framebuffer. Bail out
		// before forming a row pointer from off-screen coordinates.
		return;
	}

	const int32_t bboxWidth = maxX - minX;
	const int32_t bboxHeight = maxY - minY;

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
	const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
	const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
	const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
	const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
	const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
	const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
	const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

	// Triangle setup. edgeN is the edge opposite to vertex N, so wN is 0 on
	// that edge and equals iarea at vertex N.
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);
	const int32_t w0_pmin = swr_edgeEval(edge0, minX, minY);
	const int32_t w1_pmin = swr_edgeEval(edge1, minX, minY);
	const int32_t w2_pmin = swr_edgeEval(edge2, minX, minY);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const float inv_area = 1.0f / (float)iarea;
#endif

	// Rasterize
	int32_t w0_row = w0_pmin;
	int32_t w1_row = w1_pmin;
	int32_t w2_row = w2_pmin;
	uint32_t* fb_row = &ctx->m_FrameBuffer[minX + minY * ctx->m_Width];

	for (int32_t py = 0; py <= bboxHeight; ++py) {
		// Calculate the range of x values (relative to minX) for which all 3
		// barycentric coordinates are greater than or equal to 0. Each edge
		// narrows the range independently; an empty range (pxmin > pxmax)
		// means the row has no covered pixel inside the bounding box.
		int32_t pxmin = 0;
		int32_t pxmax = bboxWidth;
		swrRef_clipRowToEdge(w0_row, edge0.m_dx, &pxmin, &pxmax);
		swrRef_clipRowToEdge(w1_row, edge1.m_dx, &pxmin, &pxmax);
		swrRef_clipRowToEdge(w2_row, edge2.m_dx, &pxmin, &pxmax);

		// Calculate barycentric coords at pxmin
		int32_t w0 = w0_row + pxmin * edge0.m_dx;
		int32_t w1 = w1_row + pxmin * edge1.m_dx;
		int32_t w2 = w2_row + pxmin * edge2.m_dx;

		for (int32_t px = pxmin; px <= pxmax; ++px) {
			// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
			// Render the pixel
			{
				assert(w0 >= 0 && w1 >= 0 && w2 >= 0);

#if SWR_CONFIG_NO_PIXEL_SHADER
				const uint32_t rgba = 0xFFFFFFFF;
#else
				const float l0 = (float)w0 * inv_area;
				const float l1 = (float)w1 * inv_area;

				// l2 = 1.0f - (l0 + l1)
				//
				// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
				// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
				// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
				// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
				//
				// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
				const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
				const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
				const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
				const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
				const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif

				fb_row[px] = rgba;
			}

			w0 += edge0.m_dx;
			w1 += edge1.m_dx;
			w2 += edge2.m_dx;
		}

		// Step all edge functions and the destination pointer one row down.
		w0_row += edge0.m_dy;
		w1_row += edge1.m_dy;
		w2_row += edge2.m_dy;
		fb_row += ctx->m_Width;
	}
}

// Edge length, in pixels, of the square blocks processed by the hierarchical
// rasterizers.
static const uint32_t kBlockSize = 4;

// Selects the bit polarity of SWR_ROW_MASK() / SWR_BLOCK_MASK():
//   1 : a sample's bit is set when its edge function value is >= 0
//   0 : a sample's bit is set when its edge function value is <  0
#define SWR_CONFIG_USE_POSITIVE_MASKS 0

// Example 4x1 row:
// *---*---*---*---*
// | A |   |   | B |
// *---*---*---*---*
//
// A = (blockMinX, y)
// B = (blockMaxX, y)
//
// Case | A | B | result
// -----|---|---|----------------------------------
//  00  | - | - | not covered
//  01  | - | + | partially covered, [xmin, blockMaxX]
//  10  | + | - | partially covered, [blockMinX, xmax]
//  11  | + | + | fully covered

// Bit layout of a row mask: bit 1 describes end A (blockMinX), bit 0 end B
// (blockMaxX).
#define SWR_ROW_MASK_A_Pos 1
#define SWR_ROW_MASK_A_Msk (0x01 << SWR_ROW_MASK_A_Pos)
#define SWR_ROW_MASK_B_Pos 0
#define SWR_ROW_MASK_B_Msk (0x01 << SWR_ROW_MASK_B_Pos)

// Both polarities expose the same predicates:
//   SWR_ROW_MASK(wA, wB)      : mask built from the edge values at A and B
//   SWR_ROW_MASK_EMPTY        : mask of a row with both ends negative
//   SWR_ROW_MASK_FULL         : mask of a row with both ends non-negative
//   SWR_ROW_MASK_X_MIN(msk)   : true when end A is negative
//   SWR_ROW_MASK_X_MAX(msk)   : true when end B is negative
//   SWR_ROW_MASK_ANY_EMPTY    : true when at least one of 3 masks is EMPTY
//   SWR_ROW_MASK_ALL_FULL     : true when all 3 masks are FULL
#if SWR_CONFIG_USE_POSITIVE_MASKS
// Positive polarity: a bit is set when the corresponding value is >= 0.
#define SWR_ROW_MASK(wA, wB) (0 \
	| ((wA) >= 0 ? SWR_ROW_MASK_A_Msk : 0x00) \
	| ((wB) >= 0 ? SWR_ROW_MASK_B_Msk : 0x00) \
)

#define SWR_ROW_MASK_EMPTY                        SWR_ROW_MASK(-1, -1)
#define SWR_ROW_MASK_FULL                         SWR_ROW_MASK(1, 1)
#define SWR_ROW_MASK_X_MIN(msk)                   (((msk) & SWR_ROW_MASK_A_Msk) == 0)
#define SWR_ROW_MASK_X_MAX(msk)                   (((msk) & SWR_ROW_MASK_B_Msk) == 0)
#define SWR_ROW_MASK_ANY_EMPTY(msk0, msk1, msk2)  ((((msk0) == SWR_ROW_MASK_EMPTY) || ((msk1) == SWR_ROW_MASK_EMPTY) || ((msk2) == SWR_ROW_MASK_EMPTY)))
#define SWR_ROW_MASK_ALL_FULL(msk0, msk1, msk2)   (((msk0) & (msk1) & (msk2)) == SWR_ROW_MASK_FULL)
#else
// Negative polarity: a bit is set when the corresponding value is < 0, so
// FULL is the all-zero mask and three masks are all FULL exactly when their
// bitwise OR is zero.
#define SWR_ROW_MASK(wA, wB) (0 \
	| ((wA) < 0 ? SWR_ROW_MASK_A_Msk : 0x00) \
	| ((wB) < 0 ? SWR_ROW_MASK_B_Msk : 0x00) \
)

#define SWR_ROW_MASK_EMPTY                        SWR_ROW_MASK(-1, -1)
#define SWR_ROW_MASK_FULL                         SWR_ROW_MASK(1, 1)
#define SWR_ROW_MASK_X_MIN(msk)                   (((msk) & SWR_ROW_MASK_A_Msk) == SWR_ROW_MASK_A_Msk)
#define SWR_ROW_MASK_X_MAX(msk)                   (((msk) & SWR_ROW_MASK_B_Msk) == SWR_ROW_MASK_B_Msk)
#define SWR_ROW_MASK_ANY_EMPTY(msk0, msk1, msk2)  ((((msk0) == SWR_ROW_MASK_EMPTY) || ((msk1) == SWR_ROW_MASK_EMPTY) || ((msk2) == SWR_ROW_MASK_EMPTY)))
#define SWR_ROW_MASK_ALL_FULL(msk0, msk1, msk2)   (((msk0) | (msk1) | (msk2)) == SWR_ROW_MASK_FULL)
#endif

// Example 4x4 block:
//
// *---*---*---*---*
// | A |   |   | B |
// *---*---*---*---*
// |   |   |   |   |
// *---*---*---*---*
// |   |   |   |   |
// *---*---*---*---*
// | D |   |   | C |
// *---*---*---*---*
//
// A = (blockMinX, blockMinY)
// B = (blockMaxX, blockMinY)
// C = (blockMaxX, blockMaxY)
// D = (blockMinX, blockMaxY)
//
// Case | A | B | C | D | result
// -----|---|---|---|---|----------------------------------
// 0000 | - | - | - | - | not covered
// 0001 | - | - | - | + | partially covered, [ymin, blockMaxY]
// 0010 | - | - | + | - | partially covered, [ymin, blockMaxY]
// 0011 | - | - | + | + | partially covered, [ymin, blockMaxY]
// 0100 | - | + | - | - | partially covered, [blockMinY, ymax]
// 0101 | - | + | - | + | invalid configuration
// 0110 | - | + | + | - | partially covered, [blockMinY, blockMaxY]
// 0111 | - | + | + | + | partially covered, [blockMinY, blockMaxY]
// 1000 | + | - | - | - | partially covered, [blockMinY, ymax]
// 1001 | + | - | - | + | partially covered, [blockMinY, blockMaxY]
// 1010 | + | - | + | - | invalid configuration
// 1011 | + | - | + | + | partially covered, [blockMinY, blockMaxY]
// 1100 | + | + | - | - | partially covered, [blockMinY, ymax]
// 1101 | + | + | - | + | partially covered, [blockMinY, blockMaxY]
// 1110 | + | + | + | - | partially covered, [blockMinY, blockMaxY]
// 1111 | + | + | + | + | fully covered

// Bit layout of a block mask, from most to least significant bit: corner A
// (bit 3), B (bit 2), C (bit 1), D (bit 0).
#define SWR_BLOCK_MASK_A_Pos 3
#define SWR_BLOCK_MASK_A_Msk (0x01 << SWR_BLOCK_MASK_A_Pos)
#define SWR_BLOCK_MASK_B_Pos 2
#define SWR_BLOCK_MASK_B_Msk (0x01 << SWR_BLOCK_MASK_B_Pos)
#define SWR_BLOCK_MASK_C_Pos 1
#define SWR_BLOCK_MASK_C_Msk (0x01 << SWR_BLOCK_MASK_C_Pos)
#define SWR_BLOCK_MASK_D_Pos 0
#define SWR_BLOCK_MASK_D_Msk (0x01 << SWR_BLOCK_MASK_D_Pos)

// Both polarities expose the same predicates:
//   SWR_BLOCK_MASK(wA, wB, wC, wD) : mask built from the 4 corner values
//   SWR_BLOCK_MASK_EMPTY           : mask of a block with all corners negative
//   SWR_BLOCK_MASK_FULL            : mask of a block with all corners non-negative
//   SWR_BLOCK_MASK_IS_VALID(msk)   : false for the two patterns in which only
//                                    diagonally opposite corners share a sign
//   SWR_BLOCK_MASK_Y_MIN(msk)      : true when both top corners (A, B) are negative
//   SWR_BLOCK_MASK_Y_MAX(msk)      : true when both bottom corners (C, D) are negative
//   SWR_BLOCK_MASK_ALL_FULL        : true when all 3 masks are FULL
#if SWR_CONFIG_USE_POSITIVE_MASKS
// Positive polarity: a bit is set when the corresponding value is >= 0.
#define SWR_BLOCK_MASK(wA, wB, wC, wD) (0 \
	| ((wA) >= 0 ? SWR_BLOCK_MASK_A_Msk : 0x00) \
	| ((wB) >= 0 ? SWR_BLOCK_MASK_B_Msk : 0x00) \
	| ((wC) >= 0 ? SWR_BLOCK_MASK_C_Msk : 0x00) \
	| ((wD) >= 0 ? SWR_BLOCK_MASK_D_Msk : 0x00) \
)

#define SWR_BLOCK_MASK_EMPTY                       SWR_BLOCK_MASK(-1, -1, -1, -1)
#define SWR_BLOCK_MASK_FULL                        SWR_BLOCK_MASK(1, 1, 1, 1)
#define SWR_BLOCK_MASK_IS_VALID(msk)               (((msk) != SWR_BLOCK_MASK(-1, 1, -1, 1)) && ((msk) != SWR_BLOCK_MASK(1, -1, 1, -1)))
#define SWR_BLOCK_MASK_Y_MIN(msk)                  (((msk) & (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk)) == 0)
#define SWR_BLOCK_MASK_Y_MAX(msk)                  (((msk) & (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk)) == 0)
#define SWR_BLOCK_MASK_ALL_FULL(msk0, msk1, msk2)  (((msk0) & (msk1) & (msk2)) == SWR_BLOCK_MASK_FULL)
#else
// Negative polarity: a bit is set when the corresponding value is < 0, so
// FULL is the all-zero mask and three masks are all FULL exactly when their
// bitwise OR is zero.
#define SWR_BLOCK_MASK(wA, wB, wC, wD) (0 \
	| ((wA) < 0 ? SWR_BLOCK_MASK_A_Msk : 0x00) \
	| ((wB) < 0 ? SWR_BLOCK_MASK_B_Msk : 0x00) \
	| ((wC) < 0 ? SWR_BLOCK_MASK_C_Msk : 0x00) \
	| ((wD) < 0 ? SWR_BLOCK_MASK_D_Msk : 0x00) \
)

#define SWR_BLOCK_MASK_EMPTY                       SWR_BLOCK_MASK(-1, -1, -1, -1)
#define SWR_BLOCK_MASK_FULL                        SWR_BLOCK_MASK(1, 1, 1, 1)
#define SWR_BLOCK_MASK_IS_VALID(msk)               (((msk) != SWR_BLOCK_MASK(-1, 1, -1, 1)) && ((msk) != SWR_BLOCK_MASK(1, -1, 1, -1)))
#define SWR_BLOCK_MASK_Y_MIN(msk)                  (((msk) & (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk)) == (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk))
#define SWR_BLOCK_MASK_Y_MAX(msk)                  (((msk) & (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk)) == (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk))
#define SWR_BLOCK_MASK_ALL_FULL(msk0, msk1, msk2)  (((msk0) | (msk1) | (msk2)) == SWR_BLOCK_MASK_FULL)
#endif

// 2-level hierarchical reference rasterizer.
//
// The clamped bounding box of the triangle is walked in blocks of
// kBlockSize x kBlockSize pixels, aligned to multiples of kBlockSize. For each
// block the three edge functions are evaluated at the 4 block corners:
//   - if one edge is negative at all 4 corners the block is skipped,
//   - if all edges are non-negative at all 4 corners the block is filled
//     without any per-row range computation,
//   - otherwise the covered row range and, per row, the covered pixel range
//     are solved from the edge functions before filling.
//
// colorN is the color at vertex N; channels are interpolated with barycentric
// weights. Either winding order is accepted. Zero-area triangles and triangles
// completely outside the framebuffer draw nothing. No fill rule is applied:
// pixels lying on an edge are drawn.
//
// Writes are limited to the clamped bounding box, so blocks that extend past
// the right or bottom framebuffer border are only partially filled.
static void swrDrawTriangleRef_Hierarchical(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box, clamped to the framebuffer
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
		// The triangle lies completely outside the framebuffer.
		return;
	}

	// Blocks start at multiples of kBlockSize.
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
	const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
	const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
	const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
	const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
	const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
	const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
	const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

	// Triangle setup. edgeN is the edge opposite to vertex N.
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const float inv_area = 1.0f / (float)iarea;
#endif

	// Rasterize
	//
	// bboxMaxX / bboxMaxY are inclusive, so a block that starts exactly on the
	// maximum still contains one row/column of the bounding box and must be
	// visited (hence "<=").
	for (int32_t blockMinY = bboxMinY_aligned, blockMaxY = bboxMinY_aligned + kBlockSize - 1;
		blockMinY <= bboxMaxY;
		blockMinY += kBlockSize, blockMaxY += kBlockSize) {

		// Last row of this block (relative to blockMinY) that still lies
		// inside the clamped bounding box.
		const int32_t blockLastRow = swr_mini((int32_t)kBlockSize - 1, bboxMaxY - blockMinY);

		for (int32_t blockMinX = bboxMinX_aligned, blockMaxX = bboxMinX_aligned + kBlockSize - 1;
			blockMinX <= bboxMaxX;
			blockMinX += kBlockSize, blockMaxX += kBlockSize) {

			// Last column of this block (relative to blockMinX) that still
			// lies inside the clamped bounding box.
			const int32_t blockLastCol = swr_mini((int32_t)kBlockSize - 1, bboxMaxX - blockMinX);

			// Evaluate 1st edge function at the 4 block corners. If all of the signed
			// distances are negative (all sign bits are 1) then the block will be empty.
			const int32_t w0_A = swr_edgeEval(edge0, blockMinX, blockMinY);
			const int32_t w0_B = swr_edgeEval(edge0, blockMaxX, blockMinY);
			const int32_t w0_C = swr_edgeEval(edge0, blockMaxX, blockMaxY);
			const int32_t w0_D = swr_edgeEval(edge0, blockMinX, blockMaxY);
			const uint32_t w0_blockMsk = SWR_BLOCK_MASK(w0_A, w0_B, w0_C, w0_D);
			assert(SWR_BLOCK_MASK_IS_VALID(w0_blockMsk));
			if (w0_blockMsk == SWR_BLOCK_MASK_EMPTY) {
				continue;
			}

			// Evaluate 2nd edge function at the 4 block corners. If all of the signed
			// distances are negative (all sign bits are 1) then the block will be empty.
			const int32_t w1_A = swr_edgeEval(edge1, blockMinX, blockMinY);
			const int32_t w1_B = swr_edgeEval(edge1, blockMaxX, blockMinY);
			const int32_t w1_C = swr_edgeEval(edge1, blockMaxX, blockMaxY);
			const int32_t w1_D = swr_edgeEval(edge1, blockMinX, blockMaxY);
			const uint32_t w1_blockMsk = SWR_BLOCK_MASK(w1_A, w1_B, w1_C, w1_D);
			assert(SWR_BLOCK_MASK_IS_VALID(w1_blockMsk));
			if (w1_blockMsk == SWR_BLOCK_MASK_EMPTY) {
				continue;
			}

			// Evaluate 3rd edge function at the 4 block corners. If all of the signed
			// distances are negative (all sign bits are 1) then the block will be empty.
			const int32_t w2_A = swr_edgeEval(edge2, blockMinX, blockMinY);
			const int32_t w2_B = swr_edgeEval(edge2, blockMaxX, blockMinY);
			const int32_t w2_C = swr_edgeEval(edge2, blockMaxX, blockMaxY);
			const int32_t w2_D = swr_edgeEval(edge2, blockMinX, blockMaxY);
			const uint32_t w2_blockMsk = SWR_BLOCK_MASK(w2_A, w2_B, w2_C, w2_D);
			assert(SWR_BLOCK_MASK_IS_VALID(w2_blockMsk));
			if (w2_blockMsk == SWR_BLOCK_MASK_EMPTY) {
				continue;
			}

			if (!SWR_BLOCK_MASK_ALL_FULL(w0_blockMsk, w1_blockMsk, w2_blockMsk)) {
				// Partial block
				int32_t pymin = 0;
				int32_t pymax = blockLastRow;

				// Narrow the row range per edge: when both bottom corners are
				// outside an edge the last covered row is derived from the top
				// corners; when both top corners are outside, the first
				// covered row is.
				{
					if (SWR_BLOCK_MASK_Y_MAX(w0_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w0_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}

					if (SWR_BLOCK_MASK_Y_MAX(w1_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w1_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}

					if (SWR_BLOCK_MASK_Y_MAX(w2_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w2_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}
				}

				// Evaluate edge functions at the first row.
				int32_t w0_blockMinX_py = swr_edgeEval(edge0, blockMinX, blockMinY + pymin);
				int32_t w1_blockMinX_py = swr_edgeEval(edge1, blockMinX, blockMinY + pymin);
				int32_t w2_blockMinX_py = swr_edgeEval(edge2, blockMinX, blockMinY + pymin);
				int32_t w0_blockMaxX_py = w0_blockMinX_py + edge0.m_dx * (kBlockSize - 1);
				int32_t w1_blockMaxX_py = w1_blockMinX_py + edge1.m_dx * (kBlockSize - 1);
				int32_t w2_blockMaxX_py = w2_blockMinX_py + edge2.m_dx * (kBlockSize - 1);

				for (int32_t py = pymin; py <= pymax; ++py) {
					const int32_t w0_rowMsk = SWR_ROW_MASK(w0_blockMinX_py, w0_blockMaxX_py);
					const int32_t w1_rowMsk = SWR_ROW_MASK(w1_blockMinX_py, w1_blockMaxX_py);
					const int32_t w2_rowMsk = SWR_ROW_MASK(w2_blockMinX_py, w2_blockMaxX_py);
					assert(w0_rowMsk != SWR_ROW_MASK_EMPTY);
					assert(w1_rowMsk != SWR_ROW_MASK_EMPTY);
					assert(w2_rowMsk != SWR_ROW_MASK_EMPTY);

					uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + (blockMinY + py) * ctx->m_Width];

					int32_t pxmin = 0;
					int32_t pxmax = blockLastCol;

					// Narrow the pixel range per edge: a negative right end
					// limits pxmax, a negative left end limits pxmin.
					if (!SWR_ROW_MASK_ALL_FULL(w0_rowMsk, w1_rowMsk, w2_rowMsk)) {
						if (SWR_ROW_MASK_X_MAX(w0_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w0_blockMinX_py, -edge0.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w0_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w0_blockMinX_py, edge0.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}

						if (SWR_ROW_MASK_X_MAX(w1_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w1_blockMinX_py, -edge1.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w1_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w1_blockMinX_py, edge1.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}

						if (SWR_ROW_MASK_X_MAX(w2_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w2_blockMinX_py, -edge2.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w2_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w2_blockMinX_py, edge2.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}
					}

					// Calculate barycentric coords at pxmin
					int32_t w0 = w0_blockMinX_py + pxmin * edge0.m_dx;
					int32_t w1 = w1_blockMinX_py + pxmin * edge1.m_dx;
					int32_t w2 = w2_blockMinX_py + pxmin * edge2.m_dx;

					for (int32_t px = pxmin; px <= pxmax; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);

#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;

							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif

							fb_row[px] = rgba;
						}

						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}

					w0_blockMinX_py += edge0.m_dy;
					w1_blockMinX_py += edge1.m_dy;
					w2_blockMinX_py += edge2.m_dy;
					w0_blockMaxX_py += edge0.m_dy;
					w1_blockMaxX_py += edge1.m_dy;
					w2_blockMaxX_py += edge2.m_dy;
				}
			} else {
				// Full block: every pixel of the block is covered. Only the
				// part inside the clamped bounding box is written.
				int32_t w0_row = w0_A;
				int32_t w1_row = w1_A;
				int32_t w2_row = w2_A;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py <= blockLastRow; ++py) {
					// Calculate barycentric coords at the start of the row
					int32_t w0 = w0_row;
					int32_t w1 = w1_row;
					int32_t w2 = w2_row;

					for (int32_t px = 0; px <= blockLastCol; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);

#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;

							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif

							fb_row[px] = rgba;
						}

						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}

					w0_row += edge0.m_dy;
					w1_row += edge1.m_dy;
					w2_row += edge2.m_dy;
					fb_row += ctx->m_Width;
				}
			}
		}
	}
}

// For signed integer arguments: true when at least one of the three values is
// negative. OR-ing the values carries any set sign bit into the result, so a
// single comparison against 0 replaces three.
#define SWR_ANY_NEGATIVE3(a, b, c)    (((a) | (b) | (c)) < 0)

// 2-level hierarchical rasterization using trivial reject/accept corners.
//
// The screen-clamped bounding box of the triangle is walked in blocks of
// kBlockSize x kBlockSize pixels, starting at the block-aligned bounding box min.
// For each block the 3 edge functions are evaluated at 2 corners:
// - trivial reject corner: if any edge function is negative there, the block is skipped.
// - trivial accept corner: if all edge functions are non-negative there, the block is
//   fully covered and is rasterized without any conditionals in the inner loops.
//
// Partially covered blocks are rasterized conditionally by keeping track of the
// edge function values at each block row's min. Only completely uncovered rows
// are skipped.
//
// ctx            Context which provides the framebuffer (m_FrameBuffer, m_Width, m_Height).
// (x0, y0)..     Vertex positions in integer pixel coordinates.
// color0..2      Packed per-vertex colors (see the SWR_COLOR_* masks). They are interpolated
//                linearly across the triangle unless SWR_CONFIG_NO_PIXEL_SHADER is enabled.
//
// A pixel is written when all 3 edge functions are >= 0, i.e. pixels lying exactly on an
// edge are included. Existing framebuffer values are overwritten (no blending).
//
// NOTE(review): blocks are always processed whole, so this presumably relies on the
// framebuffer dimensions being multiples of kBlockSize - confirm against swrCreateContext.
static void swrDrawTriangleRef_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2), together with the matching vertex colors.
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute the triangle bounding box, clamped to the framebuffer.
	// Both min and max are inclusive pixel coordinates.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;

	// Per-channel deltas relative to vertex 2 (see the interpolation formula in the inner loops).
	const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
	const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
	const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
	const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
	const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
	const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
	const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
	const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const float inv_area = 1.0f / (float)iarea;
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
	const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
	const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
	const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
	const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
	const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);

	// The trivial reject corner of an edge is the block corner where its edge function
	// is the largest: step to the block's max along every axis with a non-negative delta.
	const int32_t trivialRejectOffset0 = 0
		+ (edge0.m_dx >= 0 ? w0_blockMax_dx : 0)
		+ (edge0.m_dy >= 0 ? w0_blockMax_dy : 0)
		;
	const int32_t trivialRejectOffset1 = 0
		+ (edge1.m_dx >= 0 ? w1_blockMax_dx : 0)
		+ (edge1.m_dy >= 0 ? w1_blockMax_dy : 0)
		;
	const int32_t trivialRejectOffset2 = 0
		+ (edge2.m_dx >= 0 ? w2_blockMax_dx : 0)
		+ (edge2.m_dy >= 0 ? w2_blockMax_dy : 0)
		;

	// The trivial accept corner is the corner diagonally opposite to the trivial reject corner.
	const int32_t trivialAcceptOffset0 = (w0_blockMax_dx + w0_blockMax_dy) - trivialRejectOffset0;
	const int32_t trivialAcceptOffset1 = (w1_blockMax_dx + w1_blockMax_dy) - trivialRejectOffset1;
	const int32_t trivialAcceptOffset2 = (w2_blockMax_dx + w2_blockMax_dy) - trivialRejectOffset2;

	// Horizontal-only reject offsets, used to test a single row of a partially covered block.
	const int32_t trivialRejectOffset0_dx = 0
		+ (edge0.m_dx >= 0 ? w0_blockMax_dx : 0)
		;
	const int32_t trivialRejectOffset1_dx = 0
		+ (edge1.m_dx >= 0 ? w1_blockMax_dx : 0)
		;
	const int32_t trivialRejectOffset2_dx = 0
		+ (edge2.m_dx >= 0 ? w2_blockMax_dx : 0)
		;

	// Rasterize
	const int32_t w0_bboxMin = swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w1_bboxMin = swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w2_bboxMin = swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned);

	const int32_t w0_nextBlock_dx = edge0.m_dx * kBlockSize;
	const int32_t w0_nextBlock_dy = edge0.m_dy * kBlockSize;
	const int32_t w1_nextBlock_dx = edge1.m_dx * kBlockSize;
	const int32_t w1_nextBlock_dy = edge1.m_dy * kBlockSize;
	const int32_t w2_nextBlock_dx = edge2.m_dx * kBlockSize;
	const int32_t w2_nextBlock_dy = edge2.m_dy * kBlockSize;

	int32_t w0_blockY = w0_bboxMin;
	int32_t w1_blockY = w1_bboxMin;
	int32_t w2_blockY = w2_bboxMin;

	// NOTE: bboxMaxX/bboxMaxY are inclusive, so the loops below must use '<='. Otherwise the
	// block containing the last row/column is never visited when the bounding box max lies
	// exactly on a block boundary.
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
		int32_t w0_blockMin = w0_blockY;
		int32_t w1_blockMin = w1_blockY;
		int32_t w2_blockMin = w2_blockY;
		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const int32_t w0_trivialReject = w0_blockMin + trivialRejectOffset0;
			const int32_t w1_trivialReject = w1_blockMin + trivialRejectOffset1;
			const int32_t w2_trivialReject = w2_blockMin + trivialRejectOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialReject, w1_trivialReject, w2_trivialReject)) {
				w0_blockMin += w0_nextBlock_dx;
				w1_blockMin += w1_nextBlock_dx;
				w2_blockMin += w2_nextBlock_dx;
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//    trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//    trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const int32_t w0_trivialAccept = w0_blockMin + trivialAcceptOffset0;
			const int32_t w1_trivialAccept = w1_blockMin + trivialAcceptOffset1;
			const int32_t w2_trivialAccept = w2_blockMin + trivialAcceptOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialAccept, w1_trivialAccept, w2_trivialAccept)) {
				// Partial block
				int32_t w0_row = w0_blockMin;
				int32_t w1_row = w1_blockMin;
				int32_t w2_row = w2_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Skip the row if, for any edge, even its most positive pixel is outside.
					const int32_t w0_rowTrivialReject = w0_row + trivialRejectOffset0_dx;
					const int32_t w1_rowTrivialReject = w1_row + trivialRejectOffset1_dx;
					const int32_t w2_rowTrivialReject = w2_row + trivialRejectOffset2_dx;
					if (!SWR_ANY_NEGATIVE3(w0_rowTrivialReject, w1_rowTrivialReject, w2_rowTrivialReject)) {
						int32_t w0 = w0_row;
						int32_t w1 = w1_row;
						int32_t w2 = w2_row;

						for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
							// Only pixels inside the triangle (or on one of its edges) are rendered.
							if (!SWR_ANY_NEGATIVE3(w0, w1, w2)) {
								assert(w0 >= 0 && w1 >= 0 && w2 >= 0);

#if SWR_CONFIG_NO_PIXEL_SHADER
								const uint32_t rgba = 0xFFFFFFFF;
#else
								const float l0 = (float)w0 * inv_area;
								const float l1 = (float)w1 * inv_area;

								// l2 = 1.0f - (l0 + l1)
								//
								// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
								// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
								// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
								// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
								//
								// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
								const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
								const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
								const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
								const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
								const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
								fb_row[px] = rgba;
							}

							w0 += edge0.m_dx;
							w1 += edge1.m_dx;
							w2 += edge2.m_dx;
						}
					}

					w0_row += edge0.m_dy;
					w1_row += edge1.m_dy;
					w2_row += edge2.m_dy;
					fb_row += ctx->m_Width;
				}
			} else {
				// Full block
				int32_t w0_row = w0_blockMin;
				int32_t w1_row = w1_blockMin;
				int32_t w2_row = w2_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Calculate barycentric coords at pxmin
					int32_t w0 = w0_row;
					int32_t w1 = w1_row;
					int32_t w2 = w2_row;

					for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);

#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;

							// Same interpolation as in the partial block path:
							// attr = dattr02 * l0 + dattr12 * l1 + attr2
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
							fb_row[px] = rgba;
						}

						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}

					w0_row += edge0.m_dy;
					w1_row += edge1.m_dy;
					w2_row += edge2.m_dy;
					fb_row += ctx->m_Width;
				}
			}

			w0_blockMin += w0_nextBlock_dx;
			w1_blockMin += w1_nextBlock_dx;
			w2_blockMin += w2_nextBlock_dx;
		}

		w0_blockY += w0_nextBlock_dy;
		w1_blockY += w1_nextBlock_dy;
		w2_blockY += w2_nextBlock_dy;
	}
}

// SSE2 version of the 2-level hierarchical rasterizer using trivial reject/accept corners.
//
// The screen-clamped bounding box of the triangle is walked in blocks of
// kBlockSize x kBlockSize pixels. The 3 edge function values of a block/row are kept in
// the first 3 lanes of a single vector (the 4th lane is always 0), so each trivial
// reject/accept test is one vector add followed by one "any lane negative" test.
// Inside a block, pixels are processed 4 at a time along X.
//
// - Blocks whose trivial reject corner is outside of any edge are skipped.
// - Fully covered blocks are filled with unconditional stores.
// - Partially covered blocks skip completely uncovered rows and use a masked store
//   for the remaining rows.
//
// ctx            Context which provides the framebuffer (m_FrameBuffer, m_Width, m_Height).
// (x0, y0)..     Vertex positions in integer pixel coordinates.
// color0..2      Packed per-vertex colors, interpolated linearly across the triangle unless
//                SWR_CONFIG_NO_PIXEL_SHADER is enabled.
//
// NOTE(review): blocks are always processed whole and in groups of 4 pixels, so this
// presumably relies on the framebuffer dimensions being multiples of kBlockSize and on
// kBlockSize being a multiple of 4 - confirm against swrCreateContext / kBlockSize.
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2), together with the matching vertex colors.
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute the triangle bounding box, clamped to the framebuffer.
	// Both min and max are inclusive pixel coordinates.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);

	// Broadcast each channel (and each channel delta relative to vertex 2) to all 4 lanes,
	// so that 4 pixels can be interpolated at once.
	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
	const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
	const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
	const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
	const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
	const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);

	// Lane layout of all the "v_w_*" / offset vectors below: (edge0, edge1, edge2, 0).
	// The trivial reject corner of an edge is the block corner where its edge function
	// is the largest: step to the block's max along every axis with a non-negative delta.
	const vec4i v_trivialRejectOffset = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0) + (edge0.m_dy >= 0 ? w0_blockMax_dy : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0) + (edge1.m_dy >= 0 ? w1_blockMax_dy : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0) + (edge2.m_dy >= 0 ? w2_blockMax_dy : 0),
		0
	);

	// The trivial accept corner is the corner diagonally opposite to the trivial reject corner.
	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_fromInt4(
		(w0_blockMax_dx + w0_blockMax_dy),
		(w1_blockMax_dx + w1_blockMax_dy),
		(w2_blockMax_dx + w2_blockMax_dy),
		0), v_trivialRejectOffset
	);

	// Horizontal-only reject offsets, used to test a single row of a partially covered block.
	const vec4i v_trivialRejectOffset_dx = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0),
		0
	);

	// Per-edge offsets of 4 consecutive pixels along X and the step to the next group of 4 pixels.
	const vec4i v_edge0_dx0123 = vec4i_fromInt4(0, edge0.m_dx, edge0.m_dx * 2, edge0.m_dx * 3);
	const vec4i v_edge1_dx0123 = vec4i_fromInt4(0, edge1.m_dx, edge1.m_dx * 2, edge1.m_dx * 3);
	const vec4i v_edge2_dx0123 = vec4i_fromInt4(0, edge2.m_dx, edge2.m_dx * 2, edge2.m_dx * 3);
	const vec4i v_edge0_dx4 = vec4i_fromInt(edge0.m_dx * 4);
	const vec4i v_edge1_dx4 = vec4i_fromInt(edge1.m_dx * 4);
	const vec4i v_edge2_dx4 = vec4i_fromInt(edge2.m_dx * 4);
	const vec4i v_edge012__dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	// Rasterize
	const vec4i v_w_bboxMin = vec4i_fromInt4(
		swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned),
		0
	);

	const vec4i v_w_nextBlock_dx = vec4i_fromInt4(edge0.m_dx * kBlockSize, edge1.m_dx * kBlockSize, edge2.m_dx * kBlockSize, 0);
	const vec4i v_w_nextBlock_dy = vec4i_fromInt4(edge0.m_dy * kBlockSize, edge1.m_dy * kBlockSize, edge2.m_dy * kBlockSize, 0);

	vec4i v_w_blockY = v_w_bboxMin;

	// NOTE: bboxMaxX/bboxMaxY are inclusive, so the loops below must use '<='. Otherwise the
	// block containing the last row/column is never visited when the bounding box max lies
	// exactly on a block boundary.
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
		vec4i v_w_blockMin = v_w_blockY;
		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialReject)) {
				v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//    trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//    trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialAccept)) {
				// Partial block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Skip the row if, for any edge, even its most positive pixel is outside.
					const vec4i v_w_rowTrivialReject = vec4i_add(v_w_row, v_trivialRejectOffset_dx);
					if (!vec4i_any_neg_SSE2(v_w_rowTrivialReject)) {
						// Edge function values of the first 4 pixels of the row (1 vector per edge).
						vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
						vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
						vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);

						for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
							// Calculate the (inverse) pixel mask.
							// If any of the barycentric coordinates is negative, the pixel mask will
							// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
							// to blend between the existing framebuffer values and the new values.
							const vec4i v_izero = vec4i_zero();
							const vec4i v_w0_lt = vec4i_cmplt(v_w0, v_izero);
							const vec4i v_w1_lt = vec4i_cmplt(v_w1, v_izero);
							const vec4i v_w2_lt = vec4i_cmplt(v_w2, v_izero);
							const vec4i v_notPixelMask = vec4i_or(v_w0_lt, vec4i_or(v_w1_lt, v_w2_lt));

							{
#if SWR_CONFIG_NO_PIXEL_SHADER
								const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
								const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
								const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);

								// l2 = 1.0f - (l0 + l1)
								//
								// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
								// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
								// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
								// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
								//
								// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
								const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
								const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
								const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
								const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

								// Pack independent R32/G32/B32/A32 values of the 4 pixels into RGBA8.
								const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif

								// Store result using the pixel mask
								vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[px]);
							}

							v_w0 = vec4i_add(v_w0, v_edge0_dx4);
							v_w1 = vec4i_add(v_w1, v_edge1_dx4);
							v_w2 = vec4i_add(v_w2, v_edge2_dx4);
						}
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			} else {
				// Full block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Calculate barycentric coords at pxmin
					vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
					vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
					vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);

					for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
#if SWR_CONFIG_NO_PIXEL_SHADER
							const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
							const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
							const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);

							// Same interpolation as in the partial block path:
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
							const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
							const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
							const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

							const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
							vec4i_toInt4va(v_rgba8, &fb_row[px]);
						}

						v_w0 = vec4i_add(v_w0, v_edge0_dx4);
						v_w1 = vec4i_add(v_w1, v_edge1_dx4);
						v_w2 = vec4i_add(v_w2, v_edge2_dx4);
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			}

			v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
		}

		v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy);
	}
}

static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 4);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
	const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 4);
	const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);
	const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned;
	const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned;

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);

	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);

	// Barycentric coordinate normalization
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Trivial reject/accept corner offsets relative to block min/max.
	const vec4i v_zero = vec4i_zero();
	const vec4i v_blockSize = vec4i_fromInt(4);
	const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
	const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
	const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
	const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
	const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);
	const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
	const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);

	const vec4i v_trivialRejectOffset = vec4i_add(
		vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
		vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
	);

	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);

	const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
	const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
	const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

	// Rasterize
	const vec4i v_w_bboxMin = vec4i_fromInt4(
		swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned),
		0
	);

	const vec4i v_w_nextBlock_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize);
	const vec4i v_w_nextBlock_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize);

	vec4i v_w_blockY = v_w_bboxMin;
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY; blockMinY += 4) {
		vec4i v_w_blockMin = v_w_blockY;
		for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX; blockMinX += 4) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial rejct corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialReject)) {
				v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//    trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//    trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialAccept)) {
				// Partial block
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				vec4i v_w0_row0 = vec4i_add(vec4i_getXXXX(v_w_blockMin), v_edge0_dx0123);
				vec4i v_w1_row0 = vec4i_add(vec4i_getYYYY(v_w_blockMin), v_edge1_dx0123);
				vec4i v_w2_row0 = vec4i_add(vec4i_getZZZZ(v_w_blockMin), v_edge2_dx0123);
				vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
				vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
				vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
				vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
				vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
				vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
				vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
				vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
				vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

				// Calculate the (inverse) pixel mask.
				// If any of the barycentric coordinates is negative, the pixel mask will
				// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
				// to blend between the existing framebuffer values and the new values.
				const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
				const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
				const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
				const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

				if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
					const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
					const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
					const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
					const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
					const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
					const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
					const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
					const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
					vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
				}

				if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
					const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
					const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
					const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
					const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
					const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
					const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
					const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
					const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
					vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
				}

				if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
					const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
					const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
					const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
					const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
					const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
					const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
					const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
					const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
					vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
				}

				if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
					const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
					const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
					const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
					const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
					const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
					const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
					const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
					const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
					vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
				}
			} else {
#if 1
				// Full block
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

#if !SWR_CONFIG_NO_PIXEL_SHADER
				const vec4i v_w0_row0 = vec4i_add(vec4i_getXXXX(v_w_blockMin), v_edge0_dx0123);
				const vec4i v_w1_row0 = vec4i_add(vec4i_getYYYY(v_w_blockMin), v_edge1_dx0123);
				const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
				const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
				const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
				const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
				const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
				const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

#if SWR_CONFIG_NO_PIXEL_SHADER
				const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
				const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
				const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
				const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
				const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
				const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
				const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
				const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
				vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);

#if SWR_CONFIG_NO_PIXEL_SHADER
				const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
				const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
				const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
				const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
				const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
				const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
				const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
				const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
				vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);

#if SWR_CONFIG_NO_PIXEL_SHADER
				const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
				const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
				const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
				const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
				const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
				const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
				const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
				const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
				vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);

#if SWR_CONFIG_NO_PIXEL_SHADER
				const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
				const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
				const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
				const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
				const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
				const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
				const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
				const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
				vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
#endif
			}

			v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
		}

		v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy);
	}
}

// Rasterizes one triangle with per-vertex colors into ctx->m_FrameBuffer.
//
// The clamped bounding box of the triangle is walked in tiles of 16x4 pixels. Each tile
// consists of four horizontally adjacent 4x4 blocks and every SIMD lane of the
// v_w*_blockMin vectors holds the edge function value at the min corner of one of those
// blocks. For each tile:
//   1. The trivial reject corner of all 4 blocks is tested at once. Blocks where any edge
//      function is negative at that corner are not touched by the triangle and are skipped.
//   2. The trivial accept corner of all 4 blocks is tested at once. Blocks where all edge
//      functions are non-negative at that corner are fully covered and are filled without
//      any per-pixel coverage test. The remaining blocks are partially covered and are
//      written through a per-pixel mask.
//
// A pixel is considered covered when all 3 edge functions are non-negative at that pixel.
// The color of a covered pixel is interpolated as
//    attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2
// where l0 = w0 / area and l1 = w1 / area. If SWR_CONFIG_NO_PIXEL_SHADER is non-zero the
// interpolation is skipped and covered pixels are written as 0xFFFFFFFF.
//
// Parameters:
//   ctx               Rendering context. m_Width, m_Height and m_FrameBuffer are used.
//   (x0, y0)..(x2, y2) Triangle vertices in framebuffer pixel coordinates. Any winding is
//                      accepted; zero-area triangles are ignored.
//   color0..color2    Packed 32-bit vertex colors (unpacked with vec4f_fromRGBA8).
//
// NOTE(review): whole 4x4 blocks inside 16x4 tiles are addressed, so this appears to assume
// that the framebuffer width is a multiple of 16 and its height a multiple of 4. Confirm
// against swrCreateContext.
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box, clamp it to the framebuffer and snap it to the
	// 16x4 tile grid.
	// NOTE(review): bboxMaxX/bboxMaxY are inclusive. If swr_alignUp() returns its argument
	// unchanged when it is already aligned, a last column/row lying exactly on a tile
	// boundary is never visited. Confirm whether this is intended.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
	const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16);
	const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);

	// Broadcast each channel of color2 and of the color deltas to all 4 lanes, so that
	// 4 pixels can be interpolated per channel at once.
	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);

	// Barycentric coordinate normalization
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Trivial reject/accept corner offsets relative to block min/max.
	// Lanes x/y/z hold the values of edge0/edge1/edge2. The w lane is unused.
	const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
	const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	// Change of each edge function between the min and the max pixel of a 4x4 block.
	const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
	const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
	const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);

	const vec4i v_zero = vec4i_zero();
	const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
	const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);

	// The trivial reject corner is the block corner where the edge function is largest:
	// step to the block's max pixel only along the axes where the edge step is not negative.
	const vec4i v_trivialRejectOffset = vec4i_add(
		vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
		vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
	);
	const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);

	// The trivial accept corner is the corner opposite to the trivial reject corner.
	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
	const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

	// Per-pixel steps inside a block: 4 consecutive pixels along x and 1 row along y.
	const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
	const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
	const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
	const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

	// Rasterize
	const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));

	// Edge function steps between consecutive tiles (16 pixels along x, 4 pixels along y).
	const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
	const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
	const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
	const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
	const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
	const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);

	// Lane i of v_w*_blockY holds the edge function value at the min corner of the i-th
	// 4x4 block (x offset 0, 4, 8, 12) of the first tile of the current tile row.
	const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
	vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
	vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
	vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) {
		uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
		vec4i v_w0_blockMin = v_w0_blockY;
		vec4i v_w1_blockMin = v_w1_blockY;
		vec4i v_w2_blockMin = v_w2_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			// A set bit i in trivialRejectBlockMask means that block i was NOT rejected.
			const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
			const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
			const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
			const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);
			uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
			if (trivialRejectBlockMask == 0) {
				// None of the 4 blocks is touched by the triangle. Move on to the next tile.
				v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
				v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
				v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
				continue;
			}

			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// If the trivial accept corner of all edges is positive (inside the edge) then the triangle
			// fully covers the block.
			// A set bit i in trivialAcceptBlockMask means that block i is only partially covered.
			const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
			const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
			const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
			const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);
			uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

			// Spill the per-block edge values to memory so they can be indexed by iBlock.
			// NOTE(review): if vec4i_toInt4va() performs an aligned store, these arrays must be
			// 16-byte aligned. Confirm against its definition.
			int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
			vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
			vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
			vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

			// Visit the blocks which survived the trivial reject test. Both masks are shifted
			// in lockstep so that bit 0 always refers to block iBlock. The loop ends as soon
			// as there are no more surviving blocks in the tile.
			for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
				++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1)
			{
				if ((trivialRejectBlockMask & 1) == 0) {
					continue;
				}

				uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];
				vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
				vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);

				if ((trivialAcceptBlockMask & 1) != 0) {
					// Partial block
					vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
					vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
					vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
					vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
					vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

					// Calculate the (inverse) pixel mask.
					// If any of the barycentric coordinates is negative, the pixel mask will
					// be equal to 0xFFFFFFFF for that pixel. This mask is used when storing each row
					// to blend between the existing framebuffer values and the new values.
					// The sign bit of the OR of the 3 edge values is set iff any of them is negative.
					const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
					const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
					const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
					const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

					// Row 0 (skipped when none of its 4 pixels is covered)
					if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
					}

					// Row 1
					if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
					}

					// Row 2
					if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
					}
				} else {
					// Full block
					// All 16 pixels are covered so edge 2 is not needed (it is only used for
					// the coverage test) and the rows are stored without a pixel mask.
#if !SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

					// Row 0
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
						const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
						const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
						const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
						const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
						vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
					}

					// Row 1
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
						const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
						const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
						const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
						const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
						vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
					}

					// Row 2
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
						const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
						const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
						const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
						const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
						vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
						const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
						const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
						const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
						const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
						vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
					}
				}
			}

			// Advance to the next tile of this tile row.
			v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
			v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
			v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
		}

		// Advance to the next tile row.
		v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
		v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
		v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
	}
}

// Same as the corresponding SSE2 version except for the use of pshufb
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);
	const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, kBlockSize);
	const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, kBlockSize);
	const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned;
	const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned;

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);

	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
	const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
	const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
	const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
	const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
	const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);
	const vec4i v_trivialRejectOffset = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0) + (edge0.m_dy >= 0 ? w0_blockMax_dy : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0) + (edge1.m_dy >= 0 ? w1_blockMax_dy : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0) + (edge2.m_dy >= 0 ? w2_blockMax_dy : 0),
		0
	);

	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_fromInt4(
		(w0_blockMax_dx + w0_blockMax_dy),
		(w1_blockMax_dx + w1_blockMax_dy),
		(w2_blockMax_dx + w2_blockMax_dy),
		0), v_trivialRejectOffset
	);

	const vec4i v_trivialRejectOffset_dx = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0),
		0
	);

	const vec4i v_edge0_dx0123 = vec4i_fromInt4(0, edge0.m_dx, edge0.m_dx * 2, edge0.m_dx * 3);
	const vec4i v_edge1_dx0123 = vec4i_fromInt4(0, edge1.m_dx, edge1.m_dx * 2, edge1.m_dx * 3);
	const vec4i v_edge2_dx0123 = vec4i_fromInt4(0, edge2.m_dx, edge2.m_dx * 2, edge2.m_dx * 3);
	const vec4i v_edge0_dx4 = vec4i_fromInt(edge0.m_dx * 4);
	const vec4i v_edge1_dx4 = vec4i_fromInt(edge1.m_dx * 4);
	const vec4i v_edge2_dx4 = vec4i_fromInt(edge2.m_dx * 4);
	const vec4i v_edge012__dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	// Rasterize
	const vec4i v_w_bboxMin = vec4i_fromInt4(
		swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned),
		0
	);

	const vec4i v_w_nextBlock_dx = vec4i_fromInt4(edge0.m_dx * kBlockSize, edge1.m_dx * kBlockSize, edge2.m_dx * kBlockSize, 0);
	const vec4i v_w_nextBlock_dy = vec4i_fromInt4(edge0.m_dy * kBlockSize, edge1.m_dy * kBlockSize, edge2.m_dy * kBlockSize, 0);

	vec4i v_w_blockY = v_w_bboxMin;
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY; blockMinY += kBlockSize) {
		vec4i v_w_blockMin = v_w_blockY;
		for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialReject)) {
				v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//    trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//    trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialAccept)) {
				// Partial block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					const vec4i v_w_rowTrivialReject = vec4i_add(v_w_row, v_trivialRejectOffset_dx);
					if (!vec4i_any_neg_SSE2(v_w_rowTrivialReject)) {
						vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
						vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
						vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);

						for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
							// Calculate the (inverse) pixel mask.
							// If any of the barycentric coordinates is negative, the pixel mask will
							// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
							// to blend between the existing framebuffer values and the new values.
							const vec4i v_izero = vec4i_zero();
							const vec4i v_w0_lt = vec4i_cmplt(v_w0, v_izero);
							const vec4i v_w1_lt = vec4i_cmplt(v_w1, v_izero);
							const vec4i v_w2_lt = vec4i_cmplt(v_w2, v_izero);
							const vec4i v_notPixelMask = vec4i_or(v_w0_lt, vec4i_or(v_w1_lt, v_w2_lt));

							{
#if SWR_CONFIG_NO_PIXEL_SHADER
								const vec4i v_oldFB = vec4i_fromInt4va(&fb_row[px]);
								const vec4i v_newFB = vec4i_or(vec4i_and(v_notPixelMask, v_oldFB), vec4i_andnot(v_notPixelMask, vec4i_fromInt(-1)));
								vec4i_toInt4va(v_newFB, &fb_row[px]);
#else
								const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
								const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);

								// l2 = 1.0f - (l0 + l1)
								//
								// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
								// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
								// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
								// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
								//
								// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
								const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
								const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
								const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
								const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

								// Pack independent R32/G32/B32/A32 values of the 4 pixels into RGBA8.
								const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);

								// Store result using the pixel mask
								const vec4i v_newFB = vec4i_or(
									vec4i_and(v_notPixelMask, vec4i_fromInt4va(&fb_row[px])),
									vec4i_andnot(v_notPixelMask, v_rgba8)
								);
								vec4i_toInt4va(v_newFB, &fb_row[px]);
#endif
							}

							v_w0 = vec4i_add(v_w0, v_edge0_dx4);
							v_w1 = vec4i_add(v_w1, v_edge1_dx4);
							v_w2 = vec4i_add(v_w2, v_edge2_dx4);
						}
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			} else {
				// Full block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Calculate barycentric coords at pxmin
					vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
					vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
					vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);

					for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
#if SWR_CONFIG_NO_PIXEL_SHADER
							vec4i_toInt4va(vec4i_fromInt(-1), &fb_row[px]);
#else
							const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
							const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);

							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
							const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
							const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
							const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

							const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
							vec4i_toInt4va(v_rgba8, &fb_row[px]);
#endif
						}

						v_w0 = vec4i_add(v_w0, v_edge0_dx4);
						v_w1 = vec4i_add(v_w1, v_edge1_dx4);
						v_w2 = vec4i_add(v_w2, v_edge2_dx4);
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			}

			v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
		}

		v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy);
	}
}

// Rasterizes one triangle into ctx->m_FrameBuffer, linearly interpolating the three vertex
// colors across its surface (or writing 0xFFFFFFFF when SWR_CONFIG_NO_PIXEL_SHADER is set).
//
// Traversal is hierarchical: the bounding box (clamped to the framebuffer and aligned to 16 pixels
// horizontally and 4 pixels vertically) is walked in 16x4 tiles. Each tile is treated as four
// side-by-side 4x4 blocks; the three edge functions of the four blocks live in the four lanes of
// one vector per edge, so all four blocks are classified (rejected / partially covered / fully
// covered) at once. Surviving blocks are then shaded one 4-pixel row at a time.
//
// Parameters:
//   ctx                - render target; m_FrameBuffer, m_Width and m_Height are read.
//   (x0,y0)..(x2,y2)   - integer vertex positions in framebuffer pixel coordinates.
//   color0..color2     - packed per-vertex colors, decoded with vec4f_fromRGBA8().
//
// Triangles with zero area are skipped. If the signed area is negative, vertices 1 and 2 (and
// their colors) are swapped so the rest of the function always sees the same winding.
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute the triangle bounding box, clamped to the framebuffer, then snap it to the
	// 16x4 tile grid used by the traversal loops below.
	// NOTE(review): bboxMaxX/bboxMaxY are inclusive pixel coordinates. If swr_alignUp() returns
	// its argument unchanged when it is already aligned, a max coordinate that is an exact
	// multiple of 16 (X) or 4 (Y) is excluded by the `<` loop conditions below - confirm
	// against swr_alignUp() and the reference rasterizer.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
	const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16);
	const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);

	// Prepare interpolated attributes.
	// Each color channel is expressed as attr2 + (attr0 - attr2) * l0 + (attr1 - attr2) * l1,
	// so only the deltas against vertex 2 and vertex 2's own value have to be broadcast.
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);

	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);

	// Barycentric coordinate normalization
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Triangle setup.
	// edgeN is the edge opposite to vertex N, so its edge function value is the (unnormalized)
	// barycentric weight of vertex N.
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Trivial reject/accept corner offsets relative to block min/max.
	// Lanes X/Y/Z hold edge 0/1/2; lane W is unused padding.
	const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
	const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	// Edge function deltas from a 4x4 block's min corner to its max corner along each axis.
	const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
	const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
	const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);

	const vec4i v_zero = vec4i_zero();
	const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
	const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);

	// The trivial reject corner of an edge is the block corner where its edge function is the
	// largest: step to the block max along an axis only if the edge's delta along that axis
	// is not negative.
	const vec4i v_trivialRejectOffset = vec4i_add(
		vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
		vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
	);
	const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);

	// The trivial accept corner is the diagonally opposite corner: whatever part of the full
	// min->max delta the reject corner did not take.
	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
	const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

	// Per-edge deltas for the 4 pixels of a row (x + 0..3) and for stepping one row down.
	const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
	const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
	const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
	const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

	// Rasterize
	const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));

	// Edge function deltas for moving to the next 16x4 tile horizontally / vertically.
	const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
	const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
	const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
	const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
	const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
	const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);

	// Lane i of v_wN_blockY holds edge N evaluated at the min corner of the i-th 4x4 block
	// of the current tile (x offsets 0, 4, 8 and 12).
	const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
	vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
	vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
	vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) {
		uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
		vec4i v_w0_blockMin = v_w0_blockY;
		vec4i v_w1_blockMin = v_w1_blockY;
		vec4i v_w2_blockMin = v_w2_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			// OR-ing the three values leaves the sign bit set if any of them is negative, so after
			// the inversion bit i of the mask is set when block i may be touched by the triangle.
			const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
			const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
			const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
			const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);
			uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
			if (trivialRejectBlockMask == 0) {
				// All four blocks of this tile are outside the triangle; advance to the next tile.
				v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
				v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
				v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
				continue;
			}

			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// If the trivial accept corner of all edges is positive (inside the edge) then the triangle
			// fully covers the block.
			// Note the polarity: bit i of this mask is SET when block i is only partially covered.
			const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
			const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
			const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
			const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);
			uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

			// Spill the per-block edge values so they can be indexed by block number below.
			// NOTE(review): if vec4i_toInt4va() requires a 16-byte aligned destination, these
			// stack arrays need explicit alignment - confirm against its definition.
			int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
			vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
			vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
			vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

			// Visit the blocks of the tile from left to right; both masks are shifted in lockstep
			// so bit 0 always describes the current block. The loop ends as soon as no touched
			// block remains.
			for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
				++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1) {
				if ((trivialRejectBlockMask & 1) == 0) {
					continue;
				}

				uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];
				vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
				vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);

				if ((trivialAcceptBlockMask & 1) != 0) {
					// Partial block
					vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
					vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
					vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
					vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
					vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

					// Calculate the (inverse) pixel mask source for each row.
					// If any of the barycentric coordinates of a pixel is negative, the OR of the
					// three has its sign bit set; shifting it right by 31 turns it into the
					// per-pixel mask (0xFFFFFFFF for pixels outside the triangle) that the masked
					// store uses to keep the existing framebuffer value for those pixels.
					const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
					const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
					const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
					const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

					// Row 0 (skipped entirely when all 4 pixels are outside the triangle)
					if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
					}

					// Row 1
					if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
					}

					// Row 2
					if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
					}
				} else {
					// Full block: every pixel is inside the triangle, so no coverage mask is
					// needed and edge 2 does not have to be evaluated at all (l2 is implicit in
					// the interpolation formula).
#if !SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

					// Row 0
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
						const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
						const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
						const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
						const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
						vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
					}

					// Row 1
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
						const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
						const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
						const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
						const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
						vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
					}

					// Row 2
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
						const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
						const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
						const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
						const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
						vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
						const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
						const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
						const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
						const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
						vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
					}
				}
			}

			// Advance all four lanes to the next 16x4 tile on this tile row.
			v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
			v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
			v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
		}

		// Advance to the first tile of the next tile row (4 pixel rows down).
		v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
		v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
		v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
	}
}

static void swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
	const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16);
	const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);
	const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned;
	const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned;

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);

	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);

	// Barycentric coordinate normalization
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Trivial reject/accept corner offsets relative to block min/max.
	const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
	const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
	const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
	const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);

	const vec4i v_zero = vec4i_zero();
	const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
	const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);

	const vec4i v_trivialRejectOffset = vec4i_add(
		vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
		vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
	);
	const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);

	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
	const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

	const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
	const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
	const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
	const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

	// Rasterize
	const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));

	const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
	const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
	const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
	const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
	const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
	const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);

	const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
	vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
	vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
	vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) {
		uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
		vec4i v_w0_blockMin = v_w0_blockY;
		vec4i v_w1_blockMin = v_w1_blockY;
		vec4i v_w2_blockMin = v_w2_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial rejct corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
			const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
			const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
			const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);
			uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
			if (trivialRejectBlockMask == 0) {
				v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
				v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
				v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
				continue;
			}

			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// If the trivial accept corner of all edges is positive (inside the edge) then the triangle
			// fully covers the block.
			const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
			const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
			const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
			const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);
			uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

			int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
			vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
			vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
			vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

			for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
				++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1) {
				if ((trivialRejectBlockMask & 1) == 0) {
					continue;
				}

				uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];
				vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
				vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);

				if ((trivialAcceptBlockMask & 1) != 0) {
					// Partial block
					vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
					vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
					vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
					vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
					vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

					// Calculate the (inverse) pixel mask.
					// If any of the barycentric coordinates is negative, the pixel mask will
					// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
					// to blend between the existing framebuffer values and the new values.
					const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
					const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
					const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
					const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

					// Row 0
					if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[0]);
					}

					// Row 1
					if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
					}

					// Row 2
					if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
					}
				} else {
					// Full block
#if !SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

					// Row 0
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
						const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
						const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
						const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
						const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
						vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
					}

					// Row 1
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
						const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
						const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
						const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
						const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
						vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
					}

					// Row 2
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
						const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
						const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
						const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
						const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
						vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
						const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
						const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
						const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
						const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
						vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
					}
				}
			}

			v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
			v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
			v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
		}

		v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
		v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
		v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
	}
}

// 2-level hierarchical rasterization using trivial reject/accept corners.
//
// Similar to swrDrawTriangleRef_HierarchicalLRB_Cond() but for each partially covered block
// the range of valid rows is calculated and iterated and for each touched row the range
// of valid pixels/cols is calculated and iterated. This way there is no need for conditionals
// inside the inner-most loop and the 3 barycentric coordinates are always greater than or equal
// to 0.
//
// Hard to vectorize because the valid row/col calculations are scalar.
//
// Even though there are multiple difficult to predict branches inside the inner loops this
// seems to be marginally faster than the reference _NoCond() function.
#if 1
// Rasterizes a single triangle with per-vertex color interpolation into ctx->m_FrameBuffer.
//
// ctx            : Target context. m_FrameBuffer is addressed as [x + y * m_Width]; the bounding
//                  box is clamped to [0, m_Width - 1] x [0, m_Height - 1].
// (x0,y0)..(x2,y2): Integer vertex positions. Either winding is accepted; CW triangles are
//                  converted to CCW by swapping vertices 1 and 2 (and their colors).
// color0..color2 : Packed per-vertex colors, decoded with the SWR_COLOR_*_Msk/_Pos macros.
//
// Zero-area triangles are ignored. The screen is walked in kBlockSize x kBlockSize blocks:
// blocks failing the trivial reject test are skipped, blocks passing the trivial accept test
// are filled unconditionally and the remaining (partial) blocks are filled row by row using
// precomputed valid [pymin, pymax] and [pxmin, pxmax] ranges.
//
// NOTE(review): blocks are not clipped against the framebuffer, only the bounding box is.
// This appears to assume that the framebuffer dimensions are multiples of kBlockSize - confirm.
static void swrDrawTriangleRef_HierarchicalLRB_NoCond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box (inclusive on both ends, clamped to the framebuffer).
	// Only the min corner has to be aligned to the block grid; the block loops below
	// step from there until the block origin passes the inclusive max corner.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	// Attribute deltas relative to vertex 2 (see the interpolation formula in the pixel loops).
	const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
	const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
	const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
	const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
	const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
	const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
	const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
	const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const float inv_area = 1.0f / (float)iarea;
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	// The reject corner of an edge is the block corner where its edge function is largest,
	// so only the non-negative steps contribute to the offset.
	const int32_t trivialRejectOffset0 = 0
		+ (edge0.m_dx >= 0 ? edge0.m_dx * (kBlockSize - 1) : 0)
		+ (edge0.m_dy >= 0 ? edge0.m_dy * (kBlockSize - 1) : 0)
		;
	const int32_t trivialRejectOffset1 = 0
		+ (edge1.m_dx >= 0 ? edge1.m_dx * (kBlockSize - 1) : 0)
		+ (edge1.m_dy >= 0 ? edge1.m_dy * (kBlockSize - 1) : 0)
		;
	const int32_t trivialRejectOffset2 = 0
		+ (edge2.m_dx >= 0 ? edge2.m_dx * (kBlockSize - 1) : 0)
		+ (edge2.m_dy >= 0 ? edge2.m_dy * (kBlockSize - 1) : 0)
		;

	// The accept corner is the corner diagonally opposite to the reject corner.
	const int32_t trivialAcceptOffset0 = (edge0.m_dx + edge0.m_dy) * (kBlockSize - 1) - trivialRejectOffset0;
	const int32_t trivialAcceptOffset1 = (edge1.m_dx + edge1.m_dy) * (kBlockSize - 1) - trivialRejectOffset1;
	const int32_t trivialAcceptOffset2 = (edge2.m_dx + edge2.m_dy) * (kBlockSize - 1) - trivialRejectOffset2;

	// Rasterize
	const int32_t w0_bboxMin = swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w1_bboxMin = swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w2_bboxMin = swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned);

	// Edge function deltas when stepping one whole block in X or Y.
	const int32_t w0_block_dx = edge0.m_dx * kBlockSize;
	const int32_t w0_block_dy = edge0.m_dy * kBlockSize;
	const int32_t w1_block_dx = edge1.m_dx * kBlockSize;
	const int32_t w1_block_dy = edge1.m_dy * kBlockSize;
	const int32_t w2_block_dx = edge2.m_dx * kBlockSize;
	const int32_t w2_block_dy = edge2.m_dy * kBlockSize;

	int32_t w0_blockY = w0_bboxMin;
	int32_t w1_blockY = w1_bboxMin;
	int32_t w2_blockY = w2_bboxMin;
	// bboxMaxY/bboxMaxX are inclusive, so a block whose origin lies exactly on the max
	// row/column still contains bounding box pixels and must be visited (hence <=).
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
		int32_t w0_blockMin = w0_blockY;
		int32_t w1_blockMin = w1_blockY;
		int32_t w2_blockMin = w2_blockY;
		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const int32_t w0_trivialReject = w0_blockMin + trivialRejectOffset0;
			const int32_t w1_trivialReject = w1_blockMin + trivialRejectOffset1;
			const int32_t w2_trivialReject = w2_blockMin + trivialRejectOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialReject, w1_trivialReject, w2_trivialReject)) {
				w0_blockMin += w0_block_dx;
				w1_blockMin += w1_block_dx;
				w2_blockMin += w2_block_dx;
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//    trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//    trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const int32_t w0_trivialAccept = w0_blockMin + trivialAcceptOffset0;
			const int32_t w1_trivialAccept = w1_blockMin + trivialAcceptOffset1;
			const int32_t w2_trivialAccept = w2_blockMin + trivialAcceptOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialAccept, w1_trivialAccept, w2_trivialAccept)) {
				// Partial block
				// [pymin, pymax] is the (inclusive) range of block rows that will be visited.
				// Each edge that cuts the block may shrink it from the top or from the bottom.
				int32_t pymin = 0;
				int32_t pymax = kBlockSize - 1;

				if (w0_trivialAccept < 0) {
					// Evaluate 1st edge function at the 4 block corners. If all of the signed
					// distances are negative (all sign bits are 1) then the block will be empty.
					const int32_t w0_A = w0_blockMin;
					const int32_t w0_B = w0_blockMin + w0_block_dx;
					const int32_t w0_C = w0_blockMin + w0_block_dx + w0_block_dy;
					const int32_t w0_D = w0_blockMin + w0_block_dy;
					const uint32_t w0_blockMsk = SWR_BLOCK_MASK(w0_A, w0_B, w0_C, w0_D);
					assert(SWR_BLOCK_MASK_IS_VALID(w0_blockMsk));
					assert(w0_blockMsk != SWR_BLOCK_MASK_EMPTY);

					if (SWR_BLOCK_MASK_Y_MAX(w0_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w0_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}
				}

				if (w1_trivialAccept < 0) {
					// Evaluate 2nd edge function at the 4 block corners. If all of the signed
					// distances are negative (all sign bits are 1) then the block will be empty.
					const int32_t w1_A = w1_blockMin;
					const int32_t w1_B = w1_blockMin + w1_block_dx;
					const int32_t w1_C = w1_blockMin + w1_block_dx + w1_block_dy;
					const int32_t w1_D = w1_blockMin + w1_block_dy;
					const uint32_t w1_blockMsk = SWR_BLOCK_MASK(w1_A, w1_B, w1_C, w1_D);
					assert(SWR_BLOCK_MASK_IS_VALID(w1_blockMsk));
					assert(w1_blockMsk != SWR_BLOCK_MASK_EMPTY);

					if (SWR_BLOCK_MASK_Y_MAX(w1_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w1_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}
				}

				if (w2_trivialAccept < 0) {
					// Evaluate 3rd edge function at the 4 block corners. If all of the signed
					// distances are negative (all sign bits are 1) then the block will be empty.
					const int32_t w2_A = w2_blockMin;
					const int32_t w2_B = w2_blockMin + w2_block_dx;
					const int32_t w2_C = w2_blockMin + w2_block_dx + w2_block_dy;
					const int32_t w2_D = w2_blockMin + w2_block_dy;
					const uint32_t w2_blockMsk = SWR_BLOCK_MASK(w2_A, w2_B, w2_C, w2_D);
					assert(SWR_BLOCK_MASK_IS_VALID(w2_blockMsk));
					assert(w2_blockMsk != SWR_BLOCK_MASK_EMPTY);

					if (SWR_BLOCK_MASK_Y_MAX(w2_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w2_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}
				}

				// Evaluate edge functions at the first row.
				int32_t w0_blockMinX_py = w0_blockMin + edge0.m_dy * pymin;
				int32_t w1_blockMinX_py = w1_blockMin + edge1.m_dy * pymin;
				int32_t w2_blockMinX_py = w2_blockMin + edge2.m_dy * pymin;

				for (int32_t py = pymin; py <= pymax; ++py) {
					// Classify each edge against the two ends of the current row.
					const int32_t w0_rowMsk = SWR_ROW_MASK(w0_blockMinX_py, w0_blockMinX_py + w0_block_dx);
					const int32_t w1_rowMsk = SWR_ROW_MASK(w1_blockMinX_py, w1_blockMinX_py + w1_block_dx);
					const int32_t w2_rowMsk = SWR_ROW_MASK(w2_blockMinX_py, w2_blockMinX_py + w2_block_dx);
					assert(w0_rowMsk != SWR_ROW_MASK_EMPTY);
					assert(w1_rowMsk != SWR_ROW_MASK_EMPTY);
					assert(w2_rowMsk != SWR_ROW_MASK_EMPTY);

					uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + (blockMinY + py) * ctx->m_Width];

					// [pxmin, pxmax] is the (inclusive) range of pixels of this row that are
					// inside all 3 edges. It can end up empty (pxmin > pxmax).
					int32_t pxmin = 0;
					int32_t pxmax = (int32_t)kBlockSize - 1;

					if (!SWR_ROW_MASK_ALL_FULL(w0_rowMsk, w1_rowMsk, w2_rowMsk)) {
						if (SWR_ROW_MASK_X_MAX(w0_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w0_blockMinX_py, -edge0.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w0_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w0_blockMinX_py, edge0.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}

						if (SWR_ROW_MASK_X_MAX(w1_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w1_blockMinX_py, -edge1.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w1_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w1_blockMinX_py, edge1.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}

						if (SWR_ROW_MASK_X_MAX(w2_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w2_blockMinX_py, -edge2.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w2_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w2_blockMinX_py, edge2.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}
					}

					// Calculate barycentric coords at pxmin
					int32_t w0 = w0_blockMinX_py + pxmin * edge0.m_dx;
					int32_t w1 = w1_blockMinX_py + pxmin * edge1.m_dx;
					int32_t w2 = w2_blockMinX_py + pxmin * edge2.m_dx;

					for (int32_t px = pxmin; px <= pxmax; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);

#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;

							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif

							fb_row[px] = rgba;
						}

						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}

					w0_blockMinX_py += edge0.m_dy;
					w1_blockMinX_py += edge1.m_dy;
					w2_blockMinX_py += edge2.m_dy;
				}
			} else {
				// Full block
				int32_t w0_row = w0_blockMin;
				int32_t w1_row = w1_blockMin;
				int32_t w2_row = w2_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];

				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Calculate barycentric coords at the start of the row
					int32_t w0 = w0_row;
					int32_t w1 = w1_row;
					int32_t w2 = w2_row;

					for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);

#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;

							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif

							fb_row[px] = rgba;
						}

						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}

					w0_row += edge0.m_dy;
					w1_row += edge1.m_dy;
					w2_row += edge2.m_dy;
					fb_row += ctx->m_Width;
				}
			}

			w0_blockMin += w0_block_dx;
			w1_blockMin += w1_block_dx;
			w2_blockMin += w2_block_dx;
		}

		w0_blockY += w0_block_dy;
		w1_blockY += w1_block_dy;
		w2_blockY += w2_block_dy;
	}
}
#endif

// Initial SSE2 implementation based on swrDrawTriangleRef()
#if 1
// Rasterizes a single triangle with per-vertex color interpolation into ctx->m_FrameBuffer.
//
// ctx            : Target context. m_FrameBuffer is addressed as [x + y * m_Width]; the bounding
//                  box is clamped to [0, m_Width - 1] x [0, m_Height - 1].
// (x0,y0)..(x2,y2): Integer vertex positions. Either winding is accepted; CW triangles are
//                  converted to CCW by swapping vertices 1 and 2 (and their colors).
// color0..color2 : Packed per-vertex RGBA8 colors (unpacked with vec4f_fromRGBA8()).
//
// The 3 edge functions are kept in the x/y/z lanes of a single vec4i. For every row of the
// bounding box the inclusive range [pxmin, pxmax] of pixels with all 3 edge functions >= 0 is
// solved up front, so the pixel loop has no coverage test.
static void swrDrawTriangleSSE2_Ref(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box
	const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	if (minX > maxX || minY > maxY) {
		// The triangle is completely outside the framebuffer. Bail out before
		// forming a framebuffer pointer from out-of-range coordinates.
		return;
	}
	const int32_t bboxWidth = maxX - minX;
	const int32_t bboxHeight = maxY - minY;

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
#endif

	// Triangle setup
	// Lanes x/y/z hold edge functions 0/1/2: w(x, y) = w_px * x + w_py * y + w_c. Lane w is unused.
	const vec4i v_w_px = vec4i_fromInt4(y1 - y2, y2 - y0, y0 - y1, 0);
	const vec4i v_w_py = vec4i_fromInt4(x2 - x1, x0 - x2, x1 - x0, 0);
	const vec4i v_w_c = vec4i_fromInt4(x1 * y2 - y1 * x2, x2 * y0 - y2 * x0, x0 * y1 - y0 * x1, 0);

	const vec4i v_minX = vec4i_fromInt(minX);
	const vec4i v_minY = vec4i_fromInt(minY);
	const vec4i v_w_pmin = vec4i_add(v_w_c, vec4i_add(vec4i_mullo_SSE2(v_w_px, v_minX), vec4i_mullo_SSE2(v_w_py, v_minY)));

	// Scalar copy of the per-pixel X steps. They are constant for the whole triangle,
	// so extract them once instead of once per row.
	int32_t w_px[4];
	vec4i_toInt4vu(v_w_px, &w_px[0]);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Rasterize
	vec4i v_w_row = v_w_pmin;
	uint32_t* fb_row = &ctx->m_FrameBuffer[minX + minY * ctx->m_Width];

	for (int32_t py = 0; py <= bboxHeight; ++py) {
		int32_t pxmin = 0;
		int32_t pxmax = bboxWidth;

		// Calculate the range of x values for which the barycentric coordinates
		// will always be greater than or equal to 0.
		{
			int32_t w_row[4];
			vec4i_toInt4vu(v_w_row, &w_row[0]);

			// The barycentric coordinates are linear functions: w_pmin + i * w_px
			//
			// The inequality w_pmin + i * w_px >= 0 holds for all i's in the range:
			// 1. w_pmin >= 0 && w_px >= 0 : [0, bboxWidth]
			// 2. w_pmin >= 0 && w_px < 0  : [0, imax]         where imax = floor(w_pmin / -w_px)
			// 3. w_pmin < 0  && w_px > 0  : [imin, bboxWidth] where imin = ceil(-w_pmin / w_px)
			// 4. w_pmin < 0  && w_px <= 0 : never
			//
			// From the 3 barycentric coordinates we have 3 equations. All of them
			// should be greater than or equal to 0 to draw a pixel.
			//
			// Case 4 cannot happen for an unclipped bounding box, but once the box has been
			// clamped to the framebuffer a row can lie completely outside of an edge. Such
			// rows must be skipped, otherwise the whole row would be filled.
			for (int32_t iEdge = 0; iEdge < 3; ++iEdge) {
				const int32_t w = w_row[iEdge];
				const int32_t dwdx = w_px[iEdge];

				if (w >= 0) {
					if (dwdx < 0) {
						// Case 2: C division truncates toward 0, so this is floor(w / -dwdx).
						pxmax = swr_mini(pxmax, -(w / dwdx));
					}
					// Case 1: no restriction.
				} else if (dwdx > 0) {
					// Case 3: ceil(-w / dwdx). The extra step is only needed when the division
					// is inexact; when it is exact the pixel lies on the edge (w == 0) and
					// must be drawn, same as on the pxmax side.
					pxmin = swr_maxi(pxmin, -(w / dwdx) + ((w % dwdx) != 0 ? 1 : 0));
				} else {
					// Case 4: no pixel of this row is inside the edge. pxmin is never
					// negative, so this makes the range empty.
					pxmax = -1;
				}
			}
		}

		// Calculate barycentric coords at pxmin
		const vec4i v_pxmin = vec4i_fromInt(pxmin);
		vec4i v_w = vec4i_add(v_w_row, vec4i_mullo_SSE2(v_w_px, v_pxmin));

		for (int32_t px = pxmin; px <= pxmax; ++px) {
			// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
			// Render the pixel
			{
#if SWR_CONFIG_NO_PIXEL_SHADER
				const uint32_t rgba = 0xFFFFFFFF;
#else
				int32_t w[4];
				vec4i_toInt4vu(v_w, &w[0]);
				assert(w[0] >= 0 && w[1] >= 0 && w[2] >= 0);

				const vec4f v_l = vec4f_mul(vec4f_fromVec4i(v_w), v_inv_area);
				const vec4f v_l0 = vec4f_getXXXX(v_l);
				const vec4f v_l1 = vec4f_getYYYY(v_l);

				// l2 = 1.0f - (l0 + l1)
				//
				// attr = attr0 * l0 + attr1 * l1 + attr2 * l2                <=>
				// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1)   <=>
				// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
				// attr = dattr02 * l0 + dattr12 * l1 + attr2                 <=>
				//
				// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
				const vec4f v_c = vec4f_madd_SSE2(v_c02, v_l0, vec4f_madd_SSE2(v_c12, v_l1, v_c2));
				const uint32_t rgba = vec4f_toRGBA8(v_c);
#endif
				fb_row[px] = rgba;
			}

			v_w = vec4i_add(v_w, v_w_px);
		}

		v_w_row = vec4i_add(v_w_row, v_w_py);
		fb_row += ctx->m_Width;
	}
}
#endif


#if 0
// Old implementations (see Triangle Rasterizations posts)
//////////////////////////////////////////////////////////////////////////
// SSE2 implementation
//
#define USE_VEC4_LIB 0

// http://dss.stephanierct.com/DevBlog/?p=8
static const float xmm_ones[] = { 1.0f, 1.0f, 1.0f, 1.0f };

// SSE2-only replacement for the SSE4.1 floor: rounds each of the 4 floats in x toward -infinity.
//
// The value is truncated toward zero through a float -> int32 -> float round trip and then
// corrected by -1 in the lanes where truncation rounded up (negative non-integers).
//
// NOTE(review): because of the int32 round trip this is presumably only valid for inputs
// that fit in a 32-bit signed integer - confirm against callers.
static inline __m128 _mm_floor_ps_SSE2(__m128 x)
{
	// Build the constant in a register. The previous _mm_load_ps(&xmm_ones[0]) is an aligned
	// load, but xmm_ones is a plain float array with no guaranteed 16-byte alignment.
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
	// All-ones mask in the lanes where the truncated value ended up above the input.
	const __m128 roundedUp = _mm_cmpgt_ps(truncated, x);
	return _mm_sub_ps(truncated, _mm_and_ps(roundedUp, one));
}

// SSE2-only replacement for the SSE4.1 ceil: rounds each of the 4 floats in x toward +infinity.
//
// The value is truncated toward zero through a float -> int32 -> float round trip and then
// corrected by +1 in the lanes where truncation rounded down (positive non-integers).
//
// NOTE(review): because of the int32 round trip this is presumably only valid for inputs
// that fit in a 32-bit signed integer - confirm against callers.
static inline __m128 _mm_ceil_ps_SSE2(__m128 x)
{
	// Build the constant in a register. The previous _mm_load_ps(&xmm_ones[0]) is an aligned
	// load, but xmm_ones is a plain float array with no guaranteed 16-byte alignment.
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
	// All-ones mask in the lanes where the truncated value ended up below the input.
	const __m128 roundedDown = _mm_cmplt_ps(truncated, x);
	return _mm_add_ps(truncated, _mm_and_ps(roundedDown, one));
}

// SSE2-only replacement for the SSE4.1 _mm_mullo_epi32(): multiplies the 4 32-bit lanes of
// a and b and keeps the low 32 bits of every product.
static inline __m128i _mm_mullo_epi32_SSE2(__m128i a, __m128i b)
{
	// _mm_mul_epu32() only multiplies lanes 0 and 2 (producing two 64-bit results), so the
	// odd lanes are handled by a second multiply on copies shifted down by one lane.
	const __m128i a_odd = _mm_srli_si128(a, 4);
	const __m128i b_odd = _mm_srli_si128(b, 4);
	const __m128i prod_even = _mm_mul_epu32(a, b);
	const __m128i prod_odd = _mm_mul_epu32(a_odd, b_odd);

	// Move the low dword of each 64-bit product into lanes 0 and 1...
	const __m128i lo_even = _mm_shuffle_epi32(prod_even, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i lo_odd = _mm_shuffle_epi32(prod_odd, _MM_SHUFFLE(0, 0, 2, 0));

	// ...and interleave them back into the original lane order.
	return _mm_unpacklo_epi32(lo_even, lo_odd);
}

#define _mm_shuffle_si128(a, b, imm8) _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), imm8))

// Rasterizes a triangle with per-vertex colors into ctx->m_FrameBuffer using SSE2 only.
//
// ctx              Target context; m_FrameBuffer is indexed as x + y * m_Width and the triangle's
//                  bounding box is clamped to [0, m_Width - 1] x [0, m_Height - 1].
// (x0,y0)..(x2,y2) Integer vertex positions in pixels.
// color0..color2   Per-vertex colors; the 4 bytes of each are unpacked lowest byte first and
//                  interpolated independently (named r, g, b, a below).
//
// Triangles with zero area are skipped. If the signed area is negative, vertices 1 and 2
// (and their colors) are swapped so the rest of the function always sees a positive area.
// For every row of the bounding box the covered span [ixmin, ixmax] is computed from the
// three edge functions, then the span is filled 4 pixels at a time; a final partial group
// stores only the remaining 1-3 pixels.
//
// Two equivalent code paths are selected at compile time: USE_VEC4_LIB uses the vec4f/vec4i
// wrappers, the other one uses raw intrinsics.
static void swrDrawTriangleSSE2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
	const int32_t bboxWidth = bboxMaxX - bboxMinX;
	const int32_t bboxHeight = bboxMaxY - bboxMinY;
	if (bboxWidth < 0 || bboxHeight < 0) {
		// The clamped bounding box is empty, i.e. the triangle is completely off-screen.
		// Nothing would be drawn below; bail out before forming a frame buffer pointer
		// that may lie outside the buffer.
		return;
	}

	const int32_t dy01 = y0 - y1;
	const int32_t dx01 = x0 - x1;
	const int32_t dx20 = x2 - x0;
	const int32_t dy20 = y2 - y0;
	const int32_t dy01_dy20 = dy01 + dy20;

#if USE_VEC4_LIB
	const vec4f v4f_rgba0 = vec4f_fromRGBA8(color0);
	const vec4f v4f_rgba1 = vec4f_fromRGBA8(color1);
	const vec4f v4f_rgba2 = vec4f_fromRGBA8(color2);

	// Color is interpolated as rgba0 + u * (rgba2 - rgba0) + v * (rgba1 - rgba0).
	const vec4f v4f_drgba20 = vec4f_sub(v4f_rgba2, v4f_rgba0);
	const vec4f v4f_drgba10 = vec4f_sub(v4f_rgba1, v4f_rgba0);

	// Broadcast each channel so 4 pixels can be shaded at once (one register per channel).
	const vec4f v4f_r0 = vec4f_getXXXX(v4f_rgba0);
	const vec4f v4f_g0 = vec4f_getYYYY(v4f_rgba0);
	const vec4f v4f_b0 = vec4f_getZZZZ(v4f_rgba0);
	const vec4f v4f_a0 = vec4f_getWWWW(v4f_rgba0);
	const vec4f v4f_dr20 = vec4f_getXXXX(v4f_drgba20);
	const vec4f v4f_dg20 = vec4f_getYYYY(v4f_drgba20);
	const vec4f v4f_db20 = vec4f_getZZZZ(v4f_drgba20);
	const vec4f v4f_da20 = vec4f_getWWWW(v4f_drgba20);
	const vec4f v4f_dr10 = vec4f_getXXXX(v4f_drgba10);
	const vec4f v4f_dg10 = vec4f_getYYYY(v4f_drgba10);
	const vec4f v4f_db10 = vec4f_getZZZZ(v4f_drgba10);
	const vec4f v4f_da10 = vec4f_getWWWW(v4f_drgba10);

	const vec4f v4f_inv_area = vec4f_fromFloat(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const vec4i v4i_x_duvw_ = vec4i_fromInt4(-dy01, -dy20, dy01_dy20, 0);
	const vec4f v4f_x_duvw_1 = vec4f_mul(vec4f_fromVec4i(v4i_x_duvw_), v4f_inv_area);
	const vec4f v4f_x_duvw_2 = vec4f_add(v4f_x_duvw_1, v4f_x_duvw_1);
	const vec4f v4f_x_duvw_3 = vec4f_add(v4f_x_duvw_1, v4f_x_duvw_2);
	const vec4f v4f_x_duvw_4 = vec4f_add(v4f_x_duvw_2, v4f_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const vec4f v4f_x_duv0_duv1 = vec4f_shuffle(vec4f_zero(), v4f_x_duvw_1, VEC4F_SHUFFLE_XYXY);

	// UV deltas for the 3rd and 4th pixel
	const vec4f v4f_x_duv2_duv3 = vec4f_shuffle(v4f_x_duvw_2, v4f_x_duvw_3, VEC4F_SHUFFLE_XYXY);

	// UV deltas for stepping a whole group of 4 pixels
	const vec4f v4f_x_du4 = vec4f_getXXXX(v4f_x_duvw_4);
	const vec4f v4f_x_dv4 = vec4f_getYYYY(v4f_x_duvw_4);

	// Barycentric coordinate deltas for the Y direction
	const vec4i v4i_y_duvw_ = vec4i_fromInt4(dx01, dx20, -(dx01 + dx20), 0);

	// Calculate unnormalized barycentric coordinates of the bounding box min.
	const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
	const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
	const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
	vec4i v4i_row_uvw_ = vec4i_fromInt4(bboxMin_u, bboxMin_v, bboxMin_w, 0);

	// Per-edge scale that turns a row's u/v/w into the x offset where that edge function crosses 0.
	// NOTE(review): a zero dy01 / dy20 / dy01_dy20 makes the matching scale a float division by
	// zero; the span logic below appears to tolerate the resulting values - confirm.
	const vec4f v4f_row_uvw_scale = vec4f_fromFloat4(1.0f / (float)dy01, 1.0f / (float)dy20, 1.0f / (float)dy01_dy20, 0.0f);
#else
	const __m128i imm_zero = _mm_setzero_si128();
	// Unpack the 4 bytes of each color into 4 floats (byte -> 16 bit -> 32 bit -> float).
	const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
	const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
	const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
	// Color is interpolated as rgba0 + u * (rgba2 - rgba0) + v * (rgba1 - rgba0).
	const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
	const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);

	// Broadcast each channel so 4 pixels can be shaded at once (one register per channel).
	const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

	const __m128 xmm_zero = _mm_setzero_ps();
	const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
	const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
	const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
	const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
	const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));

	// UV deltas for the 3rd and 4th pixel
	const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

	// UV deltas for stepping a whole group of 4 pixels
	const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

	// Barycentric coordinate deltas for the Y direction
	const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

	// Calculate unnormalized barycentric coordinates of the bounding box min.
	const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
	const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
	const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
	__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

	// Per-edge scale that turns a row's u/v/w into the x offset where that edge function crosses 0.
	// NOTE(review): a zero dy01 / dy20 / dy01_dy20 makes the matching scale a float division by
	// zero; the span logic below appears to tolerate the resulting values - confirm.
	const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);
#endif

	// Selects the low byte of every 16-bit lane. Used to emulate _mm_shuffle_epi8() with SSE2, see
	// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
	const __m128i imm_lo_byte_mask = _mm_set1_epi16(0x00FF);

	uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
	for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
		// Covered span of this row, as offsets from bboxMinX (inclusive on both ends).
		int32_t ixmin = 0;
		int32_t ixmax = bboxWidth;

		// Calculate ixmin and ixmax
		{
			int32_t row_uvw_[4];
#if USE_VEC4_LIB
			vec4i_toInt4vu(v4i_row_uvw_, &row_uvw_[0]);

			const vec4f v4f_row_uvw_ = vec4f_mul(vec4f_fromVec4i(v4i_row_uvw_), v4f_row_uvw_scale);
			const vec4i v4i_row_uvw_floor = vec4i_fromVec4f(vec4f_floor_SSE2(v4f_row_uvw_));
			const vec4i v4i_row_uvw_ceil = vec4i_fromVec4f(vec4f_ceil_SSE2(v4f_row_uvw_));

			int32_t row_uvw_floor[4];
			vec4i_toInt4vu(v4i_row_uvw_floor, &row_uvw_floor[0]);

			int32_t row_uvw_ceil[4];
			vec4i_toInt4vu(v4i_row_uvw_ceil, &row_uvw_ceil[0]);
#else
			_mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

			const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
			const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
			const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

			int32_t row_uvw_floor[4];
			_mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);

			int32_t row_uvw_ceil[4];
			_mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);
#endif

			// u changes by -dy01 per pixel: it either limits the span from the right (u decreasing)
			// or from the left (u increasing).
			if (dy01 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[0]);
			} else if (row_uvw_[0] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
			}

			// Same for v, which changes by -dy20 per pixel.
			if (dy20 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[1]);
			} else if (row_uvw_[1] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
			}

			// w changes by +dy01_dy20 per pixel, hence the negated crossing point.
			if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
				ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
			} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
				ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
			}
		}

		if (ixmin <= ixmax) {
#if USE_VEC4_LIB
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const vec4i v4i_p0uvw_ = vec4i_add(v4i_row_uvw_, vec4i_mullo_SSE2(vec4i_fromInt(ixmin), v4i_x_duvw_));
			const vec4f v4f_p0uvw_ = vec4f_mul(vec4f_fromVec4i(v4i_p0uvw_), v4f_inv_area);
			const vec4f v4f_p0uvuv = vec4f_getXYXY(v4f_p0uvw_);

			// Calculate barycentric coordinates for the 4 pixels.
			const vec4f v4f_p0uv_p1uv = vec4f_add(v4f_p0uvuv, v4f_x_duv0_duv1);
			const vec4f v4f_p2uv_p3uv = vec4f_add(v4f_p0uvuv, v4f_x_duv2_duv3);

			// Extract barycentric coordinates for each pixel
			vec4f v4f_u0123 = vec4f_shuffle(v4f_p0uv_p1uv, v4f_p2uv_p3uv, VEC4F_SHUFFLE_XZXZ);
			vec4f v4f_v0123 = vec4f_shuffle(v4f_p0uv_p1uv, v4f_p2uv_p3uv, VEC4F_SHUFFLE_YWYW);
#else
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
			const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
			const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

			// Calculate barycentric coordinates for the 4 pixels.
			const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
			const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

			// Extract barycentric coordinates for each pixel
			__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
			__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));
#endif

			uint32_t* frameBuffer = &framebufferRow[ixmin];
			const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
			const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
			for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
				// Calculate the color of each pixel
#if USE_VEC4_LIB
				const vec4f v4f_r_p0123 = vec4f_add(v4f_r0, vec4f_add(vec4f_mul(v4f_dr10, v4f_v0123), vec4f_mul(v4f_dr20, v4f_u0123)));
				const vec4f v4f_g_p0123 = vec4f_add(v4f_g0, vec4f_add(vec4f_mul(v4f_dg10, v4f_v0123), vec4f_mul(v4f_dg20, v4f_u0123)));
				const vec4f v4f_b_p0123 = vec4f_add(v4f_b0, vec4f_add(vec4f_mul(v4f_db10, v4f_v0123), vec4f_mul(v4f_db20, v4f_u0123)));
				const vec4f v4f_a_p0123 = vec4f_add(v4f_a0, vec4f_add(vec4f_mul(v4f_da10, v4f_v0123), vec4f_mul(v4f_da20, v4f_u0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(v4f_r_p0123.m_XMM), _mm_cvtps_epi32(v4f_g_p0123.m_XMM)),
					_mm_packs_epi32(_mm_cvtps_epi32(v4f_b_p0123.m_XMM), _mm_cvtps_epi32(v4f_a_p0123.m_XMM))
				);
#else
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
				);
#endif

				// Transpose from channel-major to pixel-major bytes in two passes. Each pass puts
				// the even bytes of the input in the low half and the odd bytes in the high half.

				// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
				const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 =
					_mm_packus_epi16(
						_mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, imm_lo_byte_mask),
						_mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8)
					);

				// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
				const __m128i imm_rgba_p0123_u8 =
					_mm_packus_epi16(
						_mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, imm_lo_byte_mask),
						_mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8)
					);

				// Store
				_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

				// Move on to the next set of pixels
#if USE_VEC4_LIB
				v4f_u0123 = vec4f_add(v4f_u0123, v4f_x_du4);
				v4f_v0123 = vec4f_add(v4f_v0123, v4f_x_dv4);
#else
				xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
				xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
#endif
				frameBuffer += 4;
			}

			// Calculate the colors of the 4 next pixels and selectively store only the number
			// of remainder pixels for this row
			const uint32_t rem = numPixels & 3;
			if (rem != 0) {
#if USE_VEC4_LIB
				const vec4f v4f_r_p0123 = vec4f_madd_SSE2(v4f_dr10, v4f_v0123, vec4f_madd_SSE2(v4f_dr20, v4f_u0123, v4f_r0));
				const vec4f v4f_g_p0123 = vec4f_madd_SSE2(v4f_dg10, v4f_v0123, vec4f_madd_SSE2(v4f_dg20, v4f_u0123, v4f_g0));
				const vec4f v4f_b_p0123 = vec4f_madd_SSE2(v4f_db10, v4f_v0123, vec4f_madd_SSE2(v4f_db20, v4f_u0123, v4f_b0));
				const vec4f v4f_a_p0123 = vec4f_madd_SSE2(v4f_da10, v4f_v0123, vec4f_madd_SSE2(v4f_da20, v4f_u0123, v4f_a0));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(v4f_r_p0123.m_XMM), _mm_cvtps_epi32(v4f_g_p0123.m_XMM)),
					_mm_packs_epi32(_mm_cvtps_epi32(v4f_b_p0123.m_XMM), _mm_cvtps_epi32(v4f_a_p0123.m_XMM))
				);
#else
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
				);
#endif

				// Same two-pass byte transpose as in the main loop.

				// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
				const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 =
					_mm_packus_epi16(
						_mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, imm_lo_byte_mask),
						_mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8)
					);

				// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
				const __m128i imm_rgba_p0123_u8 =
					_mm_packus_epi16(
						_mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, imm_lo_byte_mask),
						_mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8)
					);

				// Store only the first rem pixels so nothing past ixmax is touched.
				switch (rem) {
				case 1:
					_mm_storeu_si32(frameBuffer, imm_rgba_p0123_u8);
					break;
				case 2:
					_mm_storeu_si64(frameBuffer, imm_rgba_p0123_u8);
					break;
				case 3:
					_mm_storeu_si64(frameBuffer, imm_rgba_p0123_u8);
					_mm_storeu_si32(&frameBuffer[2], _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(2, 2, 2, 2)));
					break;
				default:
					break;
				}
			}
		}

#if USE_VEC4_LIB
		// Move on to the next row of pixels.
		v4i_row_uvw_ = vec4i_add(v4i_row_uvw_, v4i_y_duvw_);
#else
		// Move on to the next row of pixels.
		imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
#endif
		framebufferRow += ctx->m_Width;
	}
}

// Rasterizes a triangle with per-vertex colors into ctx->m_FrameBuffer. Same algorithm as the
// SSE2 variant, but the final channel-major -> pixel-major byte reordering uses the SSSE3
// _mm_shuffle_epi8 instruction.
//
// ctx              Target context; m_FrameBuffer is indexed as x + y * m_Width and the triangle's
//                  bounding box is clamped to [0, m_Width - 1] x [0, m_Height - 1].
// (x0,y0)..(x2,y2) Integer vertex positions in pixels.
// color0..color2   Per-vertex colors; the 4 bytes of each are unpacked lowest byte first and
//                  interpolated independently (named r, g, b, a below).
//
// Triangles with zero area are skipped. If the signed area is negative, vertices 1 and 2
// (and their colors) are swapped so the rest of the function always sees a positive area.
// For every row of the bounding box the covered span [ixmin, ixmax] is computed from the
// three edge functions, then the span is filled 4 pixels at a time; a final partial group
// stores only the remaining 1-3 pixels.
static void swrDrawTriangleSSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
	const int32_t bboxWidth = bboxMaxX - bboxMinX;
	const int32_t bboxHeight = bboxMaxY - bboxMinY;
	if (bboxWidth < 0 || bboxHeight < 0) {
		// The clamped bounding box is empty, i.e. the triangle is completely off-screen.
		// Nothing would be drawn below; bail out before forming a frame buffer pointer
		// that may lie outside the buffer.
		return;
	}

	const __m128i imm_zero = _mm_setzero_si128();
	// Unpack the 4 bytes of each color into 4 floats (byte -> 16 bit -> 32 bit -> float).
	const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
	const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
	const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
	// Color is interpolated as rgba0 + u * (rgba2 - rgba0) + v * (rgba1 - rgba0).
	const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
	const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);

	// Broadcast each channel so 4 pixels can be shaded at once (one register per channel).
	const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

	const int32_t dy01 = y0 - y1;
	const int32_t dx01 = x0 - x1;
	const int32_t dx20 = x2 - x0;
	const int32_t dy20 = y2 - y0;
	const int32_t dy01_dy20 = dy01 + dy20;

	const __m128 xmm_zero = _mm_setzero_ps();
	const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
	const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
	const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
	const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
	const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));

	// UV deltas for the 3rd and 4th pixel
	const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

	// UV deltas for stepping a whole group of 4 pixels
	const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

	// Barycentric coordinate deltas for the Y direction
	const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

	// Calculate unnormalized barycentric coordinates of the bounding box min.
	const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
	const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
	const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
	__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

	// Per-edge scale that turns a row's u/v/w into the x offset where that edge function crosses 0.
	// NOTE(review): a zero dy01 / dy20 / dy01_dy20 makes the matching scale a float division by
	// zero; the span logic below appears to tolerate the resulting values - confirm.
	const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

	// Byte permutation that turns { r0..r3, g0..g3, b0..b3, a0..a3 } into
	// { r0, g0, b0, a0, r1, g1, b1, a1, ... }, i.e. one RGBA uint32_t per pixel.
	const __m128i imm_rgba_shuffle_mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

	uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
	for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
		// Covered span of this row, as offsets from bboxMinX (inclusive on both ends).
		int32_t ixmin = 0;
		int32_t ixmax = bboxWidth;

		// Calculate ixmin and ixmax
		{
			int32_t row_uvw_[4];
			_mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

			const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
			const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
			const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

			int32_t row_uvw_floor[4];
			_mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);

			int32_t row_uvw_ceil[4];
			_mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

			// u changes by -dy01 per pixel: it either limits the span from the right (u decreasing)
			// or from the left (u increasing).
			if (dy01 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[0]);
			} else if (row_uvw_[0] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
			}

			// Same for v, which changes by -dy20 per pixel.
			if (dy20 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[1]);
			} else if (row_uvw_[1] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
			}

			// w changes by +dy01_dy20 per pixel, hence the negated crossing point.
			if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
				ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
			} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
				ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
			}
		}

		if (ixmin <= ixmax) {
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
			const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
			const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

			// Calculate barycentric coordinates for the 4 pixels.
			const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
			const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

			// Extract barycentric coordinates for each pixel
			__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
			__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

			uint32_t* frameBuffer = &framebufferRow[ixmin];
			const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
			const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
			for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
				// Calculate the color of each pixel
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
				);

				// Shuffle into RGBA uint32_t
				const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_rgba_shuffle_mask);

				// Store
				_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

				// Move on to the next set of pixels
				xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
				xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
				frameBuffer += 4;
			}

			// Calculate the colors of the 4 next pixels and selectively store only the number
			// of remainder pixels for this row
			const uint32_t rem = numPixels & 3;
			if (rem != 0) {
				// Calculate the color of each pixel
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
				);

				// Shuffle into RGBA uint32_t
				const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_rgba_shuffle_mask);

				// Store the first pixel unconditionally (rem >= 1), then pixel 1 or pixels 1-2,
				// so nothing past ixmax is touched.
				_mm_storeu_si32(frameBuffer, imm_rgba_p0123_u8);
				frameBuffer++;
				if (rem == 2) {
					_mm_storeu_si32(frameBuffer, _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(1, 1, 1, 1)));
				} else if (rem == 3) {
					_mm_storeu_si64(frameBuffer, _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(2, 1, 2, 1)));
				}
			}
		}

		// Move on to the next row of pixels.
		imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
		framebufferRow += ctx->m_Width;
	}
}

static void swrDrawTriangleSSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
	const int32_t bboxWidth = bboxMaxX - bboxMinX;
	const int32_t bboxHeight = bboxMaxY - bboxMinY;

	const __m128i imm_zero = _mm_setzero_si128();
	const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
	const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
	const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
	const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
	const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);

	const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

	const int32_t dy01 = y0 - y1;
	const int32_t dx01 = x0 - x1;
	const int32_t dx20 = x2 - x0;
	const int32_t dy20 = y2 - y0;
	const int32_t dy01_dy20 = dy01 + dy20;

	const __m128 xmm_zero = _mm_setzero_ps();
	const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
	const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
	const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
	const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
	const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));

	// UV deltas for the 3rd and 4th pixel
	const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

	const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

	// Barycentric coordinate deltas for the Y direction
	const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

	// Calculate unnormalized barycentric coordinates of the bounding box min.
	const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
	const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
	const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
	__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

	//
	const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

	uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
	for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
		int32_t ixmin = 0;
		int32_t ixmax = (uint32_t)bboxWidth;

		// Calculate ixmin and ixmax
		{
			int32_t row_uvw_[4];
			_mm_storeu_si128((__m128i*) & row_uvw_[0], imm_row_uvw_);

			const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
			const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps(xmm_row_uvw_));
			const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps(xmm_row_uvw_));

			int32_t row_uvw_floor[4];
			_mm_storeu_si128((__m128i*) & row_uvw_floor[0], imm_row_uvw_floor);

			int32_t row_uvw_ceil[4];
			_mm_storeu_si128((__m128i*) & row_uvw_ceil[0], imm_row_uvw_ceil);

			if (dy01 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[0]);
			} else if (row_uvw_[0] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
			}

			if (dy20 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[1]);
			} else if (row_uvw_[1] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
			}

			if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
				ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
			} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
				ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
			}
		}

		if (ixmin <= ixmax) {
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32(_mm_set1_epi32(ixmin), imm_x_duvw_));
			const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
			const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

			// Calculate barycentric coordinates for the 4 pixels.
			const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
			const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

			// Extract barycentric coordinates for each pixel
			__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
			__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

			uint32_t* frameBuffer = &framebufferRow[ixmin];
			const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
			const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
			for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
				// Calculate the color of each pixel
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
				);

				// Shuffle into RGBA uint32_t
				const __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
				const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, mask);

				// Store
				_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

				// Move on to the next set of pixels
				xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
				xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
				frameBuffer += 4;
			}

			// Calculate the colors of the 4 next pixels and selectively store only the number
			// of remainder pixels for this row
			const uint32_t rem = numPixels & 3;
			{
				// Calculate the color of each pixel
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
				);

				// Shuffle into RGBA uint32_t
				const __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
				const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, mask);

				// Load existing frame buffer values.
				const __m128i imm_frameBuffer = _mm_lddqu_si128((const __m128i*)frameBuffer);

				// Replace only the number of remainder pixels
				const __m128 blendMask = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_set_epi32(rem, rem, rem, rem), _mm_set_epi32(3, 2, 1, 0)));
				const __m128i xmm_newFrameBuffer = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(imm_frameBuffer), _mm_castsi128_ps(imm_rgba_p0123_u8), blendMask));

				// Store
				_mm_storeu_si128((__m128i*)frameBuffer, xmm_newFrameBuffer);
			}
		}

		// Move on to the next row of pixels.
		imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
		framebufferRow += ctx->m_Width;
	}
}
#endif

## swr_math.h
#ifndef SWR_SWR_MATH_H
#define SWR_SWR_MATH_H

#include <stdint.h>
#include <stdbool.h>
#include <immintrin.h>

// Four packed 32-bit floats held in one SSE register value (__m128).
typedef struct vec4f
{
	__m128 m_XMM;
} vec4f;

// 128-bit integer SSE register value (__m128i); the vec4i_* helpers operate on it
// as four 32-bit integer lanes.
typedef struct vec4i
{
	__m128i m_IMM;
} vec4i;

static inline vec4f vec4f_zero(void)
{
	return (vec4f){ .m_XMM = _mm_setzero_ps() };
}

static inline vec4f vec4f_fromFloat(float x)
{
	return (vec4f){ .m_XMM = _mm_set_ps1(x) };
}

// Converts each signed 32-bit integer lane of x to float.
static inline vec4f vec4f_fromVec4i(vec4i x)
{
	vec4f asFloat;
	asFloat.m_XMM = _mm_cvtepi32_ps(x.m_IMM);
	return asFloat;
}

static inline vec4f vec4f_fromFloat4(float x0, float x1, float x2, float x3)
{
	return (vec4f){ .m_XMM = _mm_set_ps(x3, x2, x1, x0) };
}

static inline vec4f vec4f_fromRGBA8(uint32_t rgba8)
{
	const __m128i imm_zero = _mm_setzero_si128();
	const __m128i imm_rgba8 = _mm_cvtsi32_si128(rgba8);
	const __m128i imm_rgba16 = _mm_unpacklo_epi8(imm_rgba8, imm_zero);
	const __m128i imm_rgba32 = _mm_unpacklo_epi16(imm_rgba16, imm_zero);
	return (vec4f){
		.m_XMM = _mm_cvtepi32_ps(imm_rgba32)
	};
}

// Packs four float lanes into a 4x8-bit value (lane 0 -> least significant byte).
// Lanes are converted to integers with _mm_cvtps_epi32, then saturated to
// [0, 255] by the signed 32->16 and unsigned 16->8 pack steps.
static inline uint32_t vec4f_toRGBA8(vec4f x)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i chan16 = _mm_packs_epi32(_mm_cvtps_epi32(x.m_XMM), zero);
	const __m128i chan8 = _mm_packus_epi16(chan16, zero);
	return (uint32_t)_mm_cvtsi128_si32(chan8);
}

// Lane-wise a + b.
static inline vec4f vec4f_add(vec4f a, vec4f b)
{
	vec4f sum;
	sum.m_XMM = _mm_add_ps(a.m_XMM, b.m_XMM);
	return sum;
}

// Lane-wise a - b.
static inline vec4f vec4f_sub(vec4f a, vec4f b)
{
	vec4f diff;
	diff.m_XMM = _mm_sub_ps(a.m_XMM, b.m_XMM);
	return diff;
}

// Lane-wise a * b.
static inline vec4f vec4f_mul(vec4f a, vec4f b)
{
	vec4f product;
	product.m_XMM = _mm_mul_ps(a.m_XMM, b.m_XMM);
	return product;
}

// Builds an 8-bit shuffle immediate: two bits per destination lane, lane 0 in the low bits.
// With _mm_shuffle_ps the _a lanes are read from the first operand and the _b lanes from the second.
#define VEC4_SHUFFLE_MASK(d0_a, d1_a, d2_b, d3_b) \
	((d0_a) | ((d1_a) << 2) | ((d2_b) << 4) | ((d3_b) << 6))

// Precomputed shuffle immediates. Each letter names the source lane
// (X = 0, Y = 1, Z = 2, W = 3) for the corresponding destination lane, lowest lane first.
typedef enum vec4_shuffle_mask
{
	VEC4_SHUFFLE_XXXX = VEC4_SHUFFLE_MASK(0, 0, 0, 0),
	VEC4_SHUFFLE_YYYY = VEC4_SHUFFLE_MASK(1, 1, 1, 1),
	VEC4_SHUFFLE_ZZZZ = VEC4_SHUFFLE_MASK(2, 2, 2, 2),
	VEC4_SHUFFLE_WWWW = VEC4_SHUFFLE_MASK(3, 3, 3, 3),
	VEC4_SHUFFLE_XYXY = VEC4_SHUFFLE_MASK(0, 1, 0, 1),
	VEC4_SHUFFLE_XZXZ = VEC4_SHUFFLE_MASK(0, 2, 0, 2),
	VEC4_SHUFFLE_YWYW = VEC4_SHUFFLE_MASK(1, 3, 1, 3),
	VEC4_SHUFFLE_ZWZW = VEC4_SHUFFLE_MASK(2, 3, 2, 3),
} vec4_shuffle_mask;

// Generates vec4f_get<swizzle>(x), which returns x with its lanes rearranged
// according to VEC4_SHUFFLE_<swizzle> (e.g. vec4f_getXXXX copies lane 0 into every lane).
#define VEC4F_GET_FUNC(swizzle) \
static inline vec4f vec4f_get##swizzle(vec4f x) \
{ \
	return (vec4f){ .m_XMM = _mm_shuffle_ps(x.m_XMM, x.m_XMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)) }; \
}

// Instantiate the float swizzle getters used by the rasterizer.
VEC4F_GET_FUNC(XXXX);
VEC4F_GET_FUNC(YYYY);
VEC4F_GET_FUNC(ZZZZ);
VEC4F_GET_FUNC(WWWW);
VEC4F_GET_FUNC(XYXY);
VEC4F_GET_FUNC(ZWZW);

// Function-like macro because mask must be an immediate (constant).
// Result lanes 0-1 are selected from a and lanes 2-3 from b, as encoded in mask.
// Arguments and the whole expansion are parenthesized so that expression arguments
// (e.g. *ptr) and surrounding operators bind as expected.
#define vec4f_shuffle(a, b, mask) ((vec4f){ .m_XMM = _mm_shuffle_ps((a).m_XMM, (b).m_XMM, (mask)) })

// Lane-wise floor using only SSE2 instructions.
// http://dss.stephanierct.com/DevBlog/?p=8
// Truncates toward zero, then subtracts 1 from every lane where the truncated value
// is greater than the input (negative non-integers). The truncation goes through
// _mm_cvttps_epi32, so inputs must be representable as int32_t.
static inline vec4f vec4f_floor_SSE2(vec4f x)
{
	// Built in a register rather than loaded with _mm_load_ps() from a static float
	// array: that load requires 16-byte alignment, which a float[4] does not guarantee.
	const __m128 ones = _mm_set1_ps(1.0f);
	const __m128i i = _mm_cvttps_epi32(x.m_XMM);
	const __m128 fi = _mm_cvtepi32_ps(i);
	const __m128 igx = _mm_cmpgt_ps(fi, x.m_XMM); // all ones where trunc(x) > x
	const __m128 j = _mm_and_ps(igx, ones);
	return (vec4f){ .m_XMM = _mm_sub_ps(fi, j) };
}

// Lane-wise ceil using only SSE2 instructions.
// http://dss.stephanierct.com/DevBlog/?p=8
// Truncates toward zero, then adds 1 to every lane where the truncated value is
// less than the input (positive non-integers). The truncation goes through
// _mm_cvttps_epi32, so inputs must be representable as int32_t.
static inline vec4f vec4f_ceil_SSE2(vec4f x)
{
	// Built in a register rather than loaded with _mm_load_ps() from a static float
	// array: that load requires 16-byte alignment, which a float[4] does not guarantee.
	const __m128 ones = _mm_set1_ps(1.0f);
	const __m128i i = _mm_cvttps_epi32(x.m_XMM);
	const __m128 fi = _mm_cvtepi32_ps(i);
	const __m128 igx = _mm_cmplt_ps(fi, x.m_XMM); // all ones where trunc(x) < x
	const __m128 j = _mm_and_ps(igx, ones);
	return (vec4f){ .m_XMM = _mm_add_ps(fi, j) };
}

// Lane-wise floor using the SSE4.1 rounding instruction.
static inline vec4f vec4f_floor_SSE41(vec4f x)
{
	vec4f floored;
	floored.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_FLOOR);
	return floored;
}

// Lane-wise ceil using the SSE4.1 rounding instruction.
static inline vec4f vec4f_ceil_SSE41(vec4f x)
{
	vec4f ceiled;
	ceiled.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_CEIL);
	return ceiled;
}

// Lane-wise a * b + c, done as a separate multiply followed by an add.
static inline vec4f vec4f_madd_SSE2(vec4f a, vec4f b, vec4f c)
{
	const __m128 product = _mm_mul_ps(a.m_XMM, b.m_XMM);
	vec4f result;
	result.m_XMM = _mm_add_ps(c.m_XMM, product);
	return result;
}

static inline vec4i vec4i_zero(void)
{
	return (vec4i){ .m_IMM = _mm_setzero_si128()};
}

static inline vec4i vec4i_one(void)
{
	return (vec4i){ .m_IMM = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()) };
}

static inline vec4i vec4i_fromInt(int32_t x)
{
	return (vec4i){ .m_IMM = _mm_set1_epi32(x) };
}

// Converts each float lane of x to a signed 32-bit integer with _mm_cvtps_epi32
// (rounds according to the current SSE rounding mode; it does not truncate).
static inline vec4i vec4i_fromVec4f(vec4f x)
{
	vec4i asInt;
	asInt.m_IMM = _mm_cvtps_epi32(x.m_XMM);
	return asInt;
}

static inline vec4i vec4i_fromInt4(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
{
	return (vec4i){ .m_IMM = _mm_set_epi32(x3, x2, x1, x0) };
}

static inline vec4i vec4i_fromInt4va(const int32_t* arr)
{
	return (vec4i){ .m_IMM = _mm_load_si128((const __m128i*)arr) };
}

// Stores the four lanes of x to arr using an unaligned store (no alignment requirement).
static inline void vec4i_toInt4vu(vec4i x, int32_t* arr)
{
	__m128i* const dst = (__m128i*)arr;
	_mm_storeu_si128(dst, x.m_IMM);
}

// Stores the four lanes of x to arr using an aligned store:
// arr must be 16-byte aligned.
static inline void vec4i_toInt4va(vec4i x, int32_t* arr)
{
	__m128i* const dst = (__m128i*)arr;
	_mm_store_si128(dst, x.m_IMM);
}

// Writes to buffer only the bits of x selected by mask; unselected bits keep
// their previous value. buffer must be 16-byte aligned (aligned load/store).
static inline void vec4i_toInt4va_masked(vec4i x, vec4i mask, int32_t* buffer)
{
#if 0
	_mm_maskmoveu_si128(x.m_IMM, mask.m_IMM, (char*)buffer);
#else
	__m128i* const dst = (__m128i*)buffer;
	// Read-modify-write: (old & ~mask) | (x & mask).
	const __m128i kept = _mm_andnot_si128(mask.m_IMM, _mm_load_si128(dst));
	const __m128i incoming = _mm_and_si128(mask.m_IMM, x.m_IMM);
	_mm_store_si128(dst, _mm_or_si128(kept, incoming));
#endif
}

// Inverse-masked store: bits set in maskInv keep the previous buffer contents,
// all other bits are taken from x. buffer must be 16-byte aligned.
static inline void vec4i_toInt4va_maskedInv_SSE2(vec4i x, vec4i maskInv, int32_t* buffer)
{
#if 0
	static const uint32_t ones[] = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
	const __m128i imm_ones = _mm_load_si128((const __m128i*)ones);
	const __m128i imm_mask = _mm_xor_si128(maskInv.m_IMM, imm_ones);
	_mm_maskmoveu_si128(x.m_IMM, imm_mask, (char*)buffer);
#else
	__m128i* const dst = (__m128i*)buffer;
	// Read-modify-write: (old & maskInv) | (x & ~maskInv).
	const __m128i kept = _mm_and_si128(maskInv.m_IMM, _mm_load_si128(dst));
	const __m128i incoming = _mm_andnot_si128(maskInv.m_IMM, x.m_IMM);
	_mm_store_si128(dst, _mm_or_si128(kept, incoming));
#endif
}

// Inverse-masked store using the SSE4.1 byte blend: bytes whose maskInv byte has
// its top bit set keep the previous buffer contents, the rest are taken from x.
// buffer must be 16-byte aligned.
static inline void vec4i_toInt4va_maskedInv_SSE41(vec4i x, vec4i maskInv, int32_t* buffer)
{
	__m128i* const dst = (__m128i*)buffer;
	const __m128i previous = _mm_load_si128(dst);
	_mm_store_si128(dst, _mm_blendv_epi8(x.m_IMM, previous, maskInv.m_IMM));
}

// Returns lane 0 (the lowest 32 bits) of x.
static inline int32_t vec4i_toInt(vec4i x)
{
	const int32_t lane0 = _mm_cvtsi128_si32(x.m_IMM);
	return lane0;
}

// Lane-wise 32-bit a + b (wraps on overflow).
static inline vec4i vec4i_add(vec4i a, vec4i b)
{
	vec4i sum;
	sum.m_IMM = _mm_add_epi32(a.m_IMM, b.m_IMM);
	return sum;
}

// Lane-wise 32-bit a - b (wraps on overflow).
static inline vec4i vec4i_sub(vec4i a, vec4i b)
{
	vec4i diff;
	diff.m_IMM = _mm_sub_epi32(a.m_IMM, b.m_IMM);
	return diff;
}

// Lane-wise 32-bit multiply keeping the low 32 bits of each product, SSE2 only.
static inline vec4i vec4i_mullo_SSE2(vec4i a, vec4i b)
{
	// _mm_mul_epu32 multiplies lanes 0 and 2 only: handle the even lanes directly
	// and the odd lanes after shifting them down by one lane (4 bytes).
	const __m128i evenProd = _mm_mul_epu32(a.m_IMM, b.m_IMM);
	const __m128i oddProd = _mm_mul_epu32(_mm_srli_si128(a.m_IMM, 4), _mm_srli_si128(b.m_IMM, 4));

	// Move the low dword of each 64-bit product into lanes 0/1, then interleave even/odd.
	const __m128i evenLo = _mm_shuffle_epi32(evenProd, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i oddLo = _mm_shuffle_epi32(oddProd, _MM_SHUFFLE(0, 0, 2, 0));

	vec4i product;
	product.m_IMM = _mm_unpacklo_epi32(evenLo, oddLo);
	return product;
}

// Lane-wise 32-bit multiply keeping the low 32 bits of each product (SSE4.1).
static inline vec4i vec4i_mullo_SSE41(vec4i a, vec4i b)
{
	vec4i product;
	product.m_IMM = _mm_mullo_epi32(a.m_IMM, b.m_IMM);
	return product;
}

// Bitwise a & b.
static inline vec4i vec4i_and(vec4i a, vec4i b)
{
	vec4i bits;
	bits.m_IMM = _mm_and_si128(a.m_IMM, b.m_IMM);
	return bits;
}

// Bitwise a | b.
static inline vec4i vec4i_or(vec4i a, vec4i b)
{
	vec4i bits;
	bits.m_IMM = _mm_or_si128(a.m_IMM, b.m_IMM);
	return bits;
}

// Bitwise a | b | c.
static inline vec4i vec4i_or3(vec4i a, vec4i b, vec4i c)
{
	const __m128i bc = _mm_or_si128(b.m_IMM, c.m_IMM);
	vec4i bits;
	bits.m_IMM = _mm_or_si128(a.m_IMM, bc);
	return bits;
}

// Bitwise (~a) & b. Note that it is the first operand that gets inverted.
static inline vec4i vec4i_andnot(vec4i a, vec4i b)
{
	vec4i bits;
	bits.m_IMM = _mm_andnot_si128(a.m_IMM, b.m_IMM);
	return bits;
}

// Bitwise a ^ b.
static inline vec4i vec4i_xor(vec4i a, vec4i b)
{
	vec4i bits;
	bits.m_IMM = _mm_xor_si128(a.m_IMM, b.m_IMM);
	return bits;
}

// Arithmetic (sign-propagating) right shift of every 32-bit lane by shift bits.
static inline vec4i vec4i_sar(vec4i x, uint32_t shift)
{
	vec4i shifted;
	shifted.m_IMM = _mm_srai_epi32(x.m_IMM, shift);
	return shifted;
}

// Left shift of every 32-bit lane by shift bits (zeros are shifted in).
static inline vec4i vec4i_sal(vec4i x, uint32_t shift)
{
	vec4i shifted;
	shifted.m_IMM = _mm_slli_epi32(x.m_IMM, shift);
	return shifted;
}

// Per-lane signed a < b. A result lane is all ones when true and zero when false.
static inline vec4i vec4i_cmplt(vec4i a, vec4i b)
{
	vec4i lessMask;
	lessMask.m_IMM = _mm_cmplt_epi32(a.m_IMM, b.m_IMM);
	return lessMask;
}

// Packs four planar 32-bit channel vectors (r, g, b, a: one lane per pixel) into four
// interleaved 8-bit RGBA pixels, using SSE2 only. Channel values are saturated to
// [0, 255] by the pack instructions. The byte transpose is done with two rounds of
// "split even/odd bytes and re-pack" instead of the SSSE3 byte shuffle.
static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSE2(vec4i r, vec4i g, vec4i b, vec4i a)
{
	// Pack into uint8_t
	// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
	const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
		_mm_packs_epi32(r.m_IMM, g.m_IMM), _mm_packs_epi32(b.m_IMM, a.m_IMM)
	);

	// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
	// _mm_shuffle_epi8() with SSE2
	// 0x00FF in every 16-bit word: keeps the low (even-indexed) byte of each word.
	__m128i mask = _mm_set_epi8(0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF);

	// Round 1: even bytes go to the low half, odd bytes (shifted down) to the high half.
	// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
	const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 =
		_mm_packus_epi16(
			_mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, mask),
			_mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8)
		);

	// Round 2: the same even/odd split applied again completes the transpose.
	// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
	const __m128i imm_rgba_p0123_u8 =
		_mm_packus_epi16(
			_mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, mask),
			_mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8)
		);

	return (vec4i){ .m_IMM = imm_rgba_p0123_u8 };
}

// Packs four planar 32-bit channel vectors (r, g, b, a: one lane per pixel) into four
// interleaved 8-bit RGBA pixels. Channel values are saturated to [0, 255] by the pack
// instructions; the final byte transpose uses the SSSE3 byte shuffle.
static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSSE3(vec4i r, vec4i g, vec4i b, vec4i a)
{
	// Output byte 4*p + c takes planar byte 4*c + p (p = pixel, c = channel).
	const __m128i transpose = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

	// Planar layout after narrowing 32 -> 16 -> 8 bits:
	// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
	const __m128i rg16 = _mm_packs_epi32(r.m_IMM, g.m_IMM);
	const __m128i ba16 = _mm_packs_epi32(b.m_IMM, a.m_IMM);
	const __m128i planar8 = _mm_packus_epi16(rg16, ba16);

	vec4i packed;
	packed.m_IMM = _mm_shuffle_epi8(planar8, transpose);
	return packed;
}

// Returns true when at least one 32-bit lane of x has its sign bit set.
static inline bool vec4i_any_neg_SSE2(vec4i x)
{
#if 1
	// movemask gathers the sign bit of each lane; any set bit means a negative lane.
	const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
	return signBits != 0;
#else
	const __m128i imm_zero = _mm_setzero_si128();
	const __m128i imm_cmp = _mm_cmplt_epi32(x.m_IMM, imm_zero);
	return _mm_movemask_ps(_mm_castsi128_ps(imm_cmp)) != 0;
#endif
}

// Returns true when all four 32-bit lanes of x have their sign bit set.
static inline bool vec4i_all_neg_SSE2(vec4i x)
{
	const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
	return signBits == 0x0F;
}

// Returns a 4-bit mask in which bit i is the sign bit of lane i of x.
static inline uint32_t vec4i_getSignMask(vec4i x)
{
	const __m128 asFloatBits = _mm_castsi128_ps(x.m_IMM);
	return _mm_movemask_ps(asFloatBits);
}

// Generates vec4i_get<swizzle>(x), which returns x with its 32-bit lanes rearranged
// according to VEC4_SHUFFLE_<swizzle> (e.g. vec4i_getXXXX copies lane 0 into every lane).
#define VEC4I_GET_FUNC(swizzle) \
static inline vec4i vec4i_get##swizzle(vec4i x) \
{ \
	return (vec4i){ .m_IMM = _mm_shuffle_epi32(x.m_IMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)) }; \
}

// Instantiate the integer swizzle getters.
VEC4I_GET_FUNC(XXXX);
VEC4I_GET_FUNC(YYYY);
VEC4I_GET_FUNC(ZZZZ);
VEC4I_GET_FUNC(WWWW);
VEC4I_GET_FUNC(XYXY);
VEC4I_GET_FUNC(ZWZW);

// Scalar integer helpers. They are defined as `static inline` in inline/swr_math.inl,
// so the prototypes carry the same storage class: declaring a name `static` after a
// plain (external-linkage) declaration is rejected by GCC and Clang.
static inline int32_t swr_absi(int32_t x);
static inline int32_t swr_mini(int32_t a, int32_t b);
static inline int32_t swr_maxi(int32_t a, int32_t b);
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c);
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c);
static inline int32_t swr_alignDown(int32_t x, uint32_t align);
static inline int32_t swr_alignUp(int32_t x, uint32_t align);
static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom);
static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom);

// Included before the guard closes so that including this header a second time
// does not define the inline functions again.
#include "inline/swr_math.inl"

#endif

## swr_math.inl
#ifndef SWR_SWR_MATH_H
#error "Must be included from swr_math.h"
#endif

// Absolute value of x. The result is not representable for INT32_MIN.
static inline int32_t swr_absi(int32_t x)
{
	if (x < 0) {
		return -x;
	}
	return x;
}

// Returns the smaller of a and b.
static inline int32_t swr_mini(int32_t a, int32_t b)
{
	if (a < b) {
		return a;
	}
	return b;
}

// Returns the larger of a and b.
static inline int32_t swr_maxi(int32_t a, int32_t b)
{
	if (a > b) {
		return a;
	}
	return b;
}

// Returns the smallest of a, b and c.
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c)
{
	const int32_t minBC = swr_mini(b, c);
	return swr_mini(a, minBC);
}

// Returns the largest of a, b and c.
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c)
{
	const int32_t maxBC = swr_maxi(b, c);
	return swr_maxi(a, maxBC);
}

// Rounds x down (toward negative infinity) to the nearest multiple of align.
// align must be non-zero. The arithmetic is done in signed 64-bit so that a negative x
// is not converted to unsigned by the mixed int32_t/uint32_t division, which gave wrong
// results for negative x whenever align was not a power of two.
static inline int32_t swr_alignDown(int32_t x, uint32_t align)
{
	const int64_t step = (int64_t)align;
	const int64_t value = (int64_t)x;
	int64_t rem = value % step;
	if (rem < 0) {
		rem += step; // C's % follows the sign of the dividend; normalize to [0, step)
	}
	return (int32_t)(value - rem);
}

// Rounds x up (toward positive infinity) to the nearest multiple of align.
// align must be non-zero. The arithmetic is done in signed 64-bit so that a negative x
// is not converted to unsigned by the mixed int32_t/uint32_t division, which gave wrong
// results for negative x whenever align was not a power of two.
static inline int32_t swr_alignUp(int32_t x, uint32_t align)
{
	const int64_t step = (int64_t)align;
	const int64_t value = (int64_t)x;
	int64_t rem = value % step;
	if (rem < 0) {
		rem += step; // C's % follows the sign of the dividend; normalize to [0, step)
	}
	return (int32_t)(rem == 0 ? value : value + (step - rem));
}

// Integer division rounded toward negative infinity: floor(numer / denom).
// denom must be non-zero. C's `/` truncates toward zero, so the quotient is lowered
// by one when the division is inexact and the exact result is negative
// (remainder and divisor have opposite signs).
static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom)
{
	const int32_t quot = numer / denom;
	const int32_t rem = numer % denom;
	if (rem != 0 && ((rem < 0) != (denom < 0))) {
		return quot - 1;
	}
	return quot;
}

// Integer division rounded toward positive infinity: ceil(numer / denom).
// denom must be non-zero. C's `/` truncates toward zero, so the quotient is raised
// by one only when the division is inexact and the exact result is positive
// (remainder and divisor have the same sign). Adding one on every non-zero
// remainder overshoots for negative quotients.
static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom)
{
	const int32_t quot = numer / denom;
	const int32_t rem = numer % denom;
	if (rem != 0 && ((rem < 0) == (denom < 0))) {
		return quot + 1;
	}
	return quot;
}
	#ifndef SWR_SWR_MATH_H
	#define SWR_SWR_MATH_H

	#include <stdint.h>
	#include <stdbool.h>
	#include <immintrin.h>

	// Four packed 32-bit floats held in one SSE register value (__m128).
	typedef struct vec4f
	{
	__m128 m_XMM;
	} vec4f;

	// 128-bit integer SSE register value (__m128i); the vec4i_* helpers operate on it
	// as four 32-bit integer lanes.
	typedef struct vec4i
	{
	__m128i m_IMM;
	} vec4i;

	static inline vec4f vec4f_zero(void)
	{
	return (vec4f){ .m_XMM = _mm_setzero_ps() };
	}

	static inline vec4f vec4f_fromFloat(float x)
	{
	return (vec4f){ .m_XMM = _mm_set_ps1(x) };
	}

	// Converts each signed 32-bit integer lane of x to float.
	static inline vec4f vec4f_fromVec4i(vec4i x)
	{
		vec4f asFloat;
		asFloat.m_XMM = _mm_cvtepi32_ps(x.m_IMM);
		return asFloat;
	}

	static inline vec4f vec4f_fromFloat4(float x0, float x1, float x2, float x3)
	{
	return (vec4f){ .m_XMM = _mm_set_ps(x3, x2, x1, x0) };
	}

	static inline vec4f vec4f_fromRGBA8(uint32_t rgba8)
	{
	const __m128i imm_zero = _mm_setzero_si128();
	const __m128i imm_rgba8 = _mm_cvtsi32_si128(rgba8);
	const __m128i imm_rgba16 = _mm_unpacklo_epi8(imm_rgba8, imm_zero);
	const __m128i imm_rgba32 = _mm_unpacklo_epi16(imm_rgba16, imm_zero);
	return (vec4f){
	.m_XMM = _mm_cvtepi32_ps(imm_rgba32)
	};
	}

	// Packs four float lanes into a 4x8-bit value (lane 0 -> least significant byte).
	// Lanes are converted to integers with _mm_cvtps_epi32, then saturated to
	// [0, 255] by the signed 32->16 and unsigned 16->8 pack steps.
	static inline uint32_t vec4f_toRGBA8(vec4f x)
	{
		const __m128i zero = _mm_setzero_si128();
		const __m128i chan16 = _mm_packs_epi32(_mm_cvtps_epi32(x.m_XMM), zero);
		const __m128i chan8 = _mm_packus_epi16(chan16, zero);
		return (uint32_t)_mm_cvtsi128_si32(chan8);
	}

	// Lane-wise a + b.
	static inline vec4f vec4f_add(vec4f a, vec4f b)
	{
		vec4f sum;
		sum.m_XMM = _mm_add_ps(a.m_XMM, b.m_XMM);
		return sum;
	}

	// Lane-wise a - b.
	static inline vec4f vec4f_sub(vec4f a, vec4f b)
	{
		vec4f diff;
		diff.m_XMM = _mm_sub_ps(a.m_XMM, b.m_XMM);
		return diff;
	}

	// Lane-wise a * b.
	static inline vec4f vec4f_mul(vec4f a, vec4f b)
	{
		vec4f product;
		product.m_XMM = _mm_mul_ps(a.m_XMM, b.m_XMM);
		return product;
	}

	// Builds an 8-bit shuffle immediate: two bits per destination lane, lane 0 in the low bits.
	// With _mm_shuffle_ps the _a lanes are read from the first operand and the _b lanes from the second.
	// (The bitwise-OR operators had been escaped as "\|", which is not valid C.)
	#define VEC4_SHUFFLE_MASK(d0_a, d1_a, d2_b, d3_b) (((d3_b) << 6) | ((d2_b) << 4) | ((d1_a) << 2) | ((d0_a)))

	// Precomputed shuffle immediates. Each letter names the source lane
	// (X = 0, Y = 1, Z = 2, W = 3) for the corresponding destination lane, lowest lane first.
	typedef enum vec4_shuffle_mask
	{
	VEC4_SHUFFLE_XXXX = VEC4_SHUFFLE_MASK(0, 0, 0, 0),
	VEC4_SHUFFLE_YYYY = VEC4_SHUFFLE_MASK(1, 1, 1, 1),
	VEC4_SHUFFLE_ZZZZ = VEC4_SHUFFLE_MASK(2, 2, 2, 2),
	VEC4_SHUFFLE_WWWW = VEC4_SHUFFLE_MASK(3, 3, 3, 3),
	VEC4_SHUFFLE_XYXY = VEC4_SHUFFLE_MASK(0, 1, 0, 1),
	VEC4_SHUFFLE_XZXZ = VEC4_SHUFFLE_MASK(0, 2, 0, 2),
	VEC4_SHUFFLE_YWYW = VEC4_SHUFFLE_MASK(1, 3, 1, 3),
	VEC4_SHUFFLE_ZWZW = VEC4_SHUFFLE_MASK(2, 3, 2, 3),
	} vec4_shuffle_mask;

	// Generates vec4f_get<swizzle>(x), which returns x with its lanes rearranged
	// according to VEC4_SHUFFLE_<swizzle> (e.g. vec4f_getXXXX copies lane 0 into every lane).
	#define VEC4F_GET_FUNC(swizzle) \
	static inline vec4f vec4f_get##swizzle(vec4f x) \
	{ \
	return (vec4f){ .m_XMM = _mm_shuffle_ps(x.m_XMM, x.m_XMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)) }; \
	}

	// Instantiate the float swizzle getters.
	VEC4F_GET_FUNC(XXXX);
	VEC4F_GET_FUNC(YYYY);
	VEC4F_GET_FUNC(ZZZZ);
	VEC4F_GET_FUNC(WWWW);
	VEC4F_GET_FUNC(XYXY);
	VEC4F_GET_FUNC(ZWZW);

	// Function-like macro because mask must be an immediate (constant).
	// Result lanes 0-1 are selected from a and lanes 2-3 from b, as encoded in mask.
	// Arguments and the whole expansion are parenthesized so that expression arguments
	// (e.g. *ptr) and surrounding operators bind as expected.
	#define vec4f_shuffle(a, b, mask) ((vec4f){ .m_XMM = _mm_shuffle_ps((a).m_XMM, (b).m_XMM, (mask)) })

	// Lane-wise floor using only SSE2 instructions.
	// http://dss.stephanierct.com/DevBlog/?p=8
	// Truncates toward zero, then subtracts 1 from every lane where the truncated value
	// is greater than the input (negative non-integers). The truncation goes through
	// _mm_cvttps_epi32, so inputs must be representable as int32_t.
	static inline vec4f vec4f_floor_SSE2(vec4f x)
	{
		// Built in a register rather than loaded with _mm_load_ps() from a static float
		// array: that load requires 16-byte alignment, which a float[4] does not guarantee.
		const __m128 ones = _mm_set1_ps(1.0f);
		const __m128i i = _mm_cvttps_epi32(x.m_XMM);
		const __m128 fi = _mm_cvtepi32_ps(i);
		const __m128 igx = _mm_cmpgt_ps(fi, x.m_XMM); // all ones where trunc(x) > x
		const __m128 j = _mm_and_ps(igx, ones);
		return (vec4f){ .m_XMM = _mm_sub_ps(fi, j) };
	}

	// Lane-wise ceil using only SSE2 instructions.
	// http://dss.stephanierct.com/DevBlog/?p=8
	// Truncates toward zero, then adds 1 to every lane where the truncated value is
	// less than the input (positive non-integers). The truncation goes through
	// _mm_cvttps_epi32, so inputs must be representable as int32_t.
	static inline vec4f vec4f_ceil_SSE2(vec4f x)
	{
		// Built in a register rather than loaded with _mm_load_ps() from a static float
		// array: that load requires 16-byte alignment, which a float[4] does not guarantee.
		const __m128 ones = _mm_set1_ps(1.0f);
		const __m128i i = _mm_cvttps_epi32(x.m_XMM);
		const __m128 fi = _mm_cvtepi32_ps(i);
		const __m128 igx = _mm_cmplt_ps(fi, x.m_XMM); // all ones where trunc(x) < x
		const __m128 j = _mm_and_ps(igx, ones);
		return (vec4f){ .m_XMM = _mm_add_ps(fi, j) };
	}

	// Lane-wise floor using the SSE4.1 rounding instruction.
	static inline vec4f vec4f_floor_SSE41(vec4f x)
	{
		vec4f floored;
		floored.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_FLOOR);
		return floored;
	}

	// Lane-wise ceil using the SSE4.1 rounding instruction.
	static inline vec4f vec4f_ceil_SSE41(vec4f x)
	{
		vec4f ceiled;
		ceiled.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_CEIL);
		return ceiled;
	}

	// Lane-wise a * b + c, done as a separate multiply followed by an add.
	static inline vec4f vec4f_madd_SSE2(vec4f a, vec4f b, vec4f c)
	{
		const __m128 product = _mm_mul_ps(a.m_XMM, b.m_XMM);
		vec4f result;
		result.m_XMM = _mm_add_ps(c.m_XMM, product);
		return result;
	}

	static inline vec4i vec4i_zero(void)
	{
	return (vec4i){ .m_IMM = _mm_setzero_si128()};
	}

	static inline vec4i vec4i_one(void)
	{
	return (vec4i){ .m_IMM = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()) };
	}

	static inline vec4i vec4i_fromInt(int32_t x)
	{
	return (vec4i){ .m_IMM = _mm_set1_epi32(x) };
	}

	// Converts each float lane of x to a signed 32-bit integer with _mm_cvtps_epi32
	// (rounds according to the current SSE rounding mode; it does not truncate).
	static inline vec4i vec4i_fromVec4f(vec4f x)
	{
		vec4i asInt;
		asInt.m_IMM = _mm_cvtps_epi32(x.m_XMM);
		return asInt;
	}

	static inline vec4i vec4i_fromInt4(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
	{
	return (vec4i){ .m_IMM = _mm_set_epi32(x3, x2, x1, x0) };
	}

	static inline vec4i vec4i_fromInt4va(const int32_t* arr)
	{
	return (vec4i){ .m_IMM = _mm_load_si128((const __m128i*)arr) };
	}

	// Stores the four lanes of x to arr using an unaligned store (no alignment requirement).
	static inline void vec4i_toInt4vu(vec4i x, int32_t* arr)
	{
		__m128i* const dst = (__m128i*)arr;
		_mm_storeu_si128(dst, x.m_IMM);
	}

	// Stores the four lanes of x to arr using an aligned store:
	// arr must be 16-byte aligned.
	static inline void vec4i_toInt4va(vec4i x, int32_t* arr)
	{
		__m128i* const dst = (__m128i*)arr;
		_mm_store_si128(dst, x.m_IMM);
	}

	// Writes to buffer only the bits of x selected by mask; unselected bits keep
	// their previous value. buffer must be 16-byte aligned (aligned load/store).
	static inline void vec4i_toInt4va_masked(vec4i x, vec4i mask, int32_t* buffer)
	{
	#if 0
		_mm_maskmoveu_si128(x.m_IMM, mask.m_IMM, (char*)buffer);
	#else
		__m128i* const dst = (__m128i*)buffer;
		// Read-modify-write: (old & ~mask) | (x & mask).
		const __m128i kept = _mm_andnot_si128(mask.m_IMM, _mm_load_si128(dst));
		const __m128i incoming = _mm_and_si128(mask.m_IMM, x.m_IMM);
		_mm_store_si128(dst, _mm_or_si128(kept, incoming));
	#endif
	}

	// Inverse-masked store: bits set in maskInv keep the previous buffer contents,
	// all other bits are taken from x. buffer must be 16-byte aligned.
	static inline void vec4i_toInt4va_maskedInv_SSE2(vec4i x, vec4i maskInv, int32_t* buffer)
	{
	#if 0
		static const uint32_t ones[] = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
		const __m128i imm_ones = _mm_load_si128((const __m128i*)ones);
		const __m128i imm_mask = _mm_xor_si128(maskInv.m_IMM, imm_ones);
		_mm_maskmoveu_si128(x.m_IMM, imm_mask, (char*)buffer);
	#else
		__m128i* const dst = (__m128i*)buffer;
		// Read-modify-write: (old & maskInv) | (x & ~maskInv).
		const __m128i kept = _mm_and_si128(maskInv.m_IMM, _mm_load_si128(dst));
		const __m128i incoming = _mm_andnot_si128(maskInv.m_IMM, x.m_IMM);
		_mm_store_si128(dst, _mm_or_si128(kept, incoming));
	#endif
	}

	// Inverse-masked store using the SSE4.1 byte blend: bytes whose maskInv byte has
	// its top bit set keep the previous buffer contents, the rest are taken from x.
	// buffer must be 16-byte aligned.
	static inline void vec4i_toInt4va_maskedInv_SSE41(vec4i x, vec4i maskInv, int32_t* buffer)
	{
		__m128i* const dst = (__m128i*)buffer;
		const __m128i previous = _mm_load_si128(dst);
		_mm_store_si128(dst, _mm_blendv_epi8(x.m_IMM, previous, maskInv.m_IMM));
	}

	// Returns lane 0 (the lowest 32 bits) of x.
	static inline int32_t vec4i_toInt(vec4i x)
	{
		const int32_t lane0 = _mm_cvtsi128_si32(x.m_IMM);
		return lane0;
	}

	// Lane-wise 32-bit a + b (wraps on overflow).
	static inline vec4i vec4i_add(vec4i a, vec4i b)
	{
		vec4i sum;
		sum.m_IMM = _mm_add_epi32(a.m_IMM, b.m_IMM);
		return sum;
	}

	// Lane-wise 32-bit a - b (wraps on overflow).
	static inline vec4i vec4i_sub(vec4i a, vec4i b)
	{
		vec4i diff;
		diff.m_IMM = _mm_sub_epi32(a.m_IMM, b.m_IMM);
		return diff;
	}

	// Lane-wise 32-bit multiply keeping the low 32 bits of each product, SSE2 only.
	static inline vec4i vec4i_mullo_SSE2(vec4i a, vec4i b)
	{
		// _mm_mul_epu32 multiplies lanes 0 and 2 only: handle the even lanes directly
		// and the odd lanes after shifting them down by one lane (4 bytes).
		const __m128i evenProd = _mm_mul_epu32(a.m_IMM, b.m_IMM);
		const __m128i oddProd = _mm_mul_epu32(_mm_srli_si128(a.m_IMM, 4), _mm_srli_si128(b.m_IMM, 4));

		// Move the low dword of each 64-bit product into lanes 0/1, then interleave even/odd.
		const __m128i evenLo = _mm_shuffle_epi32(evenProd, _MM_SHUFFLE(0, 0, 2, 0));
		const __m128i oddLo = _mm_shuffle_epi32(oddProd, _MM_SHUFFLE(0, 0, 2, 0));

		vec4i product;
		product.m_IMM = _mm_unpacklo_epi32(evenLo, oddLo);
		return product;
	}

	// Lane-wise 32-bit multiply keeping the low 32 bits of each product (SSE4.1).
	static inline vec4i vec4i_mullo_SSE41(vec4i a, vec4i b)
	{
		vec4i product;
		product.m_IMM = _mm_mullo_epi32(a.m_IMM, b.m_IMM);
		return product;
	}

	// Bitwise a & b.
	static inline vec4i vec4i_and(vec4i a, vec4i b)
	{
		vec4i bits;
		bits.m_IMM = _mm_and_si128(a.m_IMM, b.m_IMM);
		return bits;
	}

	// Bitwise a | b.
	static inline vec4i vec4i_or(vec4i a, vec4i b)
	{
		vec4i bits;
		bits.m_IMM = _mm_or_si128(a.m_IMM, b.m_IMM);
		return bits;
	}

	// Bitwise a | b | c.
	static inline vec4i vec4i_or3(vec4i a, vec4i b, vec4i c)
	{
		const __m128i bc = _mm_or_si128(b.m_IMM, c.m_IMM);
		vec4i bits;
		bits.m_IMM = _mm_or_si128(a.m_IMM, bc);
		return bits;
	}

	// Bitwise (~a) & b. Note that it is the first operand that gets inverted.
	static inline vec4i vec4i_andnot(vec4i a, vec4i b)
	{
		vec4i bits;
		bits.m_IMM = _mm_andnot_si128(a.m_IMM, b.m_IMM);
		return bits;
	}

	// Bitwise a ^ b.
	static inline vec4i vec4i_xor(vec4i a, vec4i b)
	{
		vec4i bits;
		bits.m_IMM = _mm_xor_si128(a.m_IMM, b.m_IMM);
		return bits;
	}

	// Arithmetic (sign-propagating) right shift of every 32-bit lane by shift bits.
	static inline vec4i vec4i_sar(vec4i x, uint32_t shift)
	{
		vec4i shifted;
		shifted.m_IMM = _mm_srai_epi32(x.m_IMM, shift);
		return shifted;
	}

	// Left shift of every 32-bit lane by shift bits (zeros are shifted in).
	static inline vec4i vec4i_sal(vec4i x, uint32_t shift)
	{
		vec4i shifted;
		shifted.m_IMM = _mm_slli_epi32(x.m_IMM, shift);
		return shifted;
	}

	// Per-lane signed a < b. A result lane is all ones when true and zero when false.
	static inline vec4i vec4i_cmplt(vec4i a, vec4i b)
	{
		vec4i lessMask;
		lessMask.m_IMM = _mm_cmplt_epi32(a.m_IMM, b.m_IMM);
		return lessMask;
	}

	// Packs four planar 32-bit channel vectors (r, g, b, a: one lane per pixel) into four
	// interleaved 8-bit RGBA pixels, using SSE2 only. Channel values are saturated to
	// [0, 255] by the pack instructions. The byte transpose is done with two rounds of
	// "split even/odd bytes and re-pack" instead of the SSSE3 byte shuffle.
	static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSE2(vec4i r, vec4i g, vec4i b, vec4i a)
	{
	// Pack into uint8_t
	// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
	const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
	_mm_packs_epi32(r.m_IMM, g.m_IMM), _mm_packs_epi32(b.m_IMM, a.m_IMM)
	);

	// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
	// _mm_shuffle_epi8() with SSE2
	// 0x00FF in every 16-bit word: keeps the low (even-indexed) byte of each word.
	__m128i mask = _mm_set_epi8(0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF);

	// Round 1: even bytes go to the low half, odd bytes (shifted down) to the high half.
	// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
	const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 =
	_mm_packus_epi16(
	_mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, mask),
	_mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8)
	);

	// Round 2: the same even/odd split applied again completes the transpose.
	// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
	const __m128i imm_rgba_p0123_u8 =
	_mm_packus_epi16(
	_mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, mask),
	_mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8)
	);

	return (vec4i){ .m_IMM = imm_rgba_p0123_u8 };
	}

	// Packs four planar 32-bit channel vectors (r, g, b, a: one lane per pixel) into four
	// interleaved 8-bit RGBA pixels. Channel values are saturated to [0, 255] by the pack
	// instructions; the final byte transpose uses the SSSE3 byte shuffle.
	static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSSE3(vec4i r, vec4i g, vec4i b, vec4i a)
	{
		// Output byte 4*p + c takes planar byte 4*c + p (p = pixel, c = channel).
		const __m128i transpose = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

		// Planar layout after narrowing 32 -> 16 -> 8 bits:
		// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
		const __m128i rg16 = _mm_packs_epi32(r.m_IMM, g.m_IMM);
		const __m128i ba16 = _mm_packs_epi32(b.m_IMM, a.m_IMM);
		const __m128i planar8 = _mm_packus_epi16(rg16, ba16);

		vec4i packed;
		packed.m_IMM = _mm_shuffle_epi8(planar8, transpose);
		return packed;
	}

	// Returns true when at least one 32-bit lane of x has its sign bit set.
	static inline bool vec4i_any_neg_SSE2(vec4i x)
	{
	#if 1
		// movemask gathers the sign bit of each lane; any set bit means a negative lane.
		const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
		return signBits != 0;
	#else
		const __m128i imm_zero = _mm_setzero_si128();
		const __m128i imm_cmp = _mm_cmplt_epi32(x.m_IMM, imm_zero);
		return _mm_movemask_ps(_mm_castsi128_ps(imm_cmp)) != 0;
	#endif
	}

	// Returns true when all four 32-bit lanes of x have their sign bit set.
	static inline bool vec4i_all_neg_SSE2(vec4i x)
	{
		const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
		return signBits == 0x0F;
	}

	// Returns a 4-bit mask in which bit i is the sign bit of lane i of x.
	static inline uint32_t vec4i_getSignMask(vec4i x)
	{
		const __m128 asFloatBits = _mm_castsi128_ps(x.m_IMM);
		return _mm_movemask_ps(asFloatBits);
	}

	// Generates vec4i_get<swizzle>(x), which returns x with its 32-bit lanes rearranged
	// according to VEC4_SHUFFLE_<swizzle> (e.g. vec4i_getXXXX copies lane 0 into every lane).
	#define VEC4I_GET_FUNC(swizzle) \
	static inline vec4i vec4i_get##swizzle(vec4i x) \
	{ \
	return (vec4i){ .m_IMM = _mm_shuffle_epi32(x.m_IMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)) }; \
	}

	// Instantiate the integer swizzle getters.
	VEC4I_GET_FUNC(XXXX);
	VEC4I_GET_FUNC(YYYY);
	VEC4I_GET_FUNC(ZZZZ);
	VEC4I_GET_FUNC(WWWW);
	VEC4I_GET_FUNC(XYXY);
	VEC4I_GET_FUNC(ZWZW);

	// Scalar integer helpers. They are defined as `static inline` in inline/swr_math.inl,
	// so the prototypes carry the same storage class: declaring a name `static` after a
	// plain (external-linkage) declaration is rejected by GCC and Clang.
	static inline int32_t swr_absi(int32_t x);
	static inline int32_t swr_mini(int32_t a, int32_t b);
	static inline int32_t swr_maxi(int32_t a, int32_t b);
	static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c);
	static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c);
	static inline int32_t swr_alignDown(int32_t x, uint32_t align);
	static inline int32_t swr_alignUp(int32_t x, uint32_t align);
	static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom);
	static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom);

	// Included before the guard closes so that including this header a second time
	// does not define the inline functions again.
	#include "inline/swr_math.inl"

	#endif
	#ifndef SWR_SWR_MATH_H
	#error "Must be included from swr_math.h"
	#endif

	// Absolute value of x. The result is not representable for INT32_MIN.
	static inline int32_t swr_absi(int32_t x)
	{
		if (x < 0) {
			return -x;
		}
		return x;
	}

	// Returns the smaller of a and b.
	static inline int32_t swr_mini(int32_t a, int32_t b)
	{
		if (a < b) {
			return a;
		}
		return b;
	}

	// Returns the larger of a and b.
	static inline int32_t swr_maxi(int32_t a, int32_t b)
	{
		if (a > b) {
			return a;
		}
		return b;
	}

	// Returns the smallest of a, b and c.
	static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c)
	{
		const int32_t minBC = swr_mini(b, c);
		return swr_mini(a, minBC);
	}

	// Returns the largest of a, b and c.
	static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c)
	{
		const int32_t maxBC = swr_maxi(b, c);
		return swr_maxi(a, maxBC);
	}

	// Rounds x down (toward negative infinity) to the nearest multiple of align.
	// align must be non-zero. The arithmetic is done in signed 64-bit so that a negative x
	// is not converted to unsigned by the mixed int32_t/uint32_t division, which gave wrong
	// results for negative x whenever align was not a power of two.
	static inline int32_t swr_alignDown(int32_t x, uint32_t align)
	{
		const int64_t step = (int64_t)align;
		const int64_t value = (int64_t)x;
		int64_t rem = value % step;
		if (rem < 0) {
			rem += step; // C's % follows the sign of the dividend; normalize to [0, step)
		}
		return (int32_t)(value - rem);
	}

	// Rounds x up (toward positive infinity) to the nearest multiple of align.
	// align must be non-zero. The arithmetic is done in signed 64-bit so that a negative x
	// is not converted to unsigned by the mixed int32_t/uint32_t division, which gave wrong
	// results for negative x whenever align was not a power of two.
	static inline int32_t swr_alignUp(int32_t x, uint32_t align)
	{
		const int64_t step = (int64_t)align;
		const int64_t value = (int64_t)x;
		int64_t rem = value % step;
		if (rem < 0) {
			rem += step; // C's % follows the sign of the dividend; normalize to [0, step)
		}
		return (int32_t)(rem == 0 ? value : value + (step - rem));
	}

	// Integer division rounded toward negative infinity: floor(numer / denom).
	// denom must be non-zero. C's `/` truncates toward zero, so the quotient is lowered
	// by one when the division is inexact and the exact result is negative
	// (remainder and divisor have opposite signs).
	static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom)
	{
		const int32_t quot = numer / denom;
		const int32_t rem = numer % denom;
		if (rem != 0 && ((rem < 0) != (denom < 0))) {
			return quot - 1;
		}
		return quot;
	}

	// Integer division rounded toward positive infinity: ceil(numer / denom).
	// denom must be non-zero. C's `/` truncates toward zero, so the quotient is raised
	// by one only when the division is inexact and the exact result is positive
	// (remainder and divisor have the same sign). Adding one on every non-zero
	// remainder overshoots for negative quotients.
	static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom)
	{
		const int32_t quot = numer / denom;
		const int32_t rem = numer % denom;
		if (rem != 0 && ((rem < 0) == (denom < 0))) {
			return quot + 1;
		}
		return quot;
	}