Skip to content

Instantly share code, notes, and snippets.

@jdryg
Last active January 4, 2023 18:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdryg/0947bf63db538b4d1e587ffa6fe8b642 to your computer and use it in GitHub Desktop.
Save jdryg/0947bf63db538b4d1e587ffa6fe8b642 to your computer and use it in GitHub Desktop.
Software Renderer
#include "swr.h"
#include "swr_math.h"
#include <stdbool.h>
#include <malloc.h>
#include <memory.h>
#include <string.h>
#include <assert.h>
#include <immintrin.h>
// Compile-time switch for per-pixel shading in the triangle rasterizers of this
// file: when non-zero, every covered pixel is written as 0xFFFFFFFF instead of
// interpolating the three vertex colors.
#define SWR_CONFIG_NO_PIXEL_SHADER 0
// Forward declarations of the file-local implementations.
//
// Context management and basic primitives.
static swr_context* swrCreateContext(uint32_t w, uint32_t h);
static void swrDestroyContext(swr_context* ctx);
static void swrClear(swr_context* ctx, uint32_t color);
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color);
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color);
// Triangle entry point stored in the API table; it installs one of the
// rasterizer variants declared below.
static void swrDrawTriangleDispatch(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
// Triangle rasterizer variants. All of them share the same signature: three
// vertices (x0, y0), (x1, y1), (x2, y2) and one packed color per vertex.
static void swrDrawTriangleRef(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleRef_Hierarchical(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleRef_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleRef_HierarchicalLRB_NoCond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_Ref(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangleSSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
// Bitmap text. `end` may be NULL, in which case `str` is read up to its
// terminating NUL.
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color);
swr_api* swr = &(swr_api){
.createContext = swrCreateContext,
.destroyContext = swrDestroyContext,
.clear = swrClear,
.drawPixel = swrDrawPixel,
.drawLine = swrDrawLine,
.drawTriangle = swrDrawTriangleDispatch,
.drawText = swrDrawText
};
// Creates a rendering context that owns a zero-filled framebuffer of
// w x h 32-bit pixels.
//
// Returns the new context, or NULL when an allocation fails or when the pixel
// count w * h cannot be represented in size_t. The caller releases the result
// with swrDestroyContext().
static swr_context* swrCreateContext(uint32_t w, uint32_t h)
{
    // calloc() leaves every field (including m_FrameBuffer) zeroed.
    swr_context* ctx = (swr_context*)calloc(1, sizeof(swr_context));
    if (!ctx) {
        return NULL;
    }

    // Guard the w * h product; calloc() itself checks the final
    // numPixels * sizeof(uint32_t) multiplication.
    const size_t numPixels = (size_t)w * (size_t)h;
    const bool countOverflowed = (h != 0) && (numPixels / h != (size_t)w);
    if (!countOverflowed) {
        ctx->m_FrameBuffer = (uint32_t*)calloc(numPixels, sizeof(uint32_t));
    }
    if (!ctx->m_FrameBuffer) {
        swrDestroyContext(ctx);
        return NULL;
    }

    ctx->m_Width = w;
    ctx->m_Height = h;
    return ctx;
}
// Releases a context created by swrCreateContext() together with its
// framebuffer. Passing NULL is a no-op, mirroring free(NULL).
static void swrDestroyContext(swr_context* ctx)
{
    if (!ctx) {
        return;
    }
    free(ctx->m_FrameBuffer);
    free(ctx);
}
// Fills every pixel of the context's framebuffer with `color`.
//
// The pixel count is computed in size_t, matching the arithmetic used to size
// the framebuffer in swrCreateContext(), so it cannot wrap at 32 bits and leave
// part of a very large buffer uncleared.
static void swrClear(swr_context* ctx, uint32_t color)
{
    uint32_t* const pixels = ctx->m_FrameBuffer;
    const size_t numPixels = (size_t)ctx->m_Width * (size_t)ctx->m_Height;
    for (size_t i = 0; i < numPixels; ++i) {
        pixels[i] = color;
    }
}
// Writes `color` at pixel (x, y). Coordinates outside the framebuffer
// rectangle [0, width) x [0, height) are silently ignored.
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color)
{
    const int32_t width = (int32_t)ctx->m_Width;
    const int32_t height = (int32_t)ctx->m_Height;
    const bool insideColumns = (x >= 0) && (x < width);
    const bool insideRows = (y >= 0) && (y < height);
    if (insideColumns && insideRows) {
        // Row-major layout: one row is m_Width pixels long.
        ctx->m_FrameBuffer[x + y * ctx->m_Width] = color;
    }
}
// Draws a line from (x0, y0) to (x1, y1), both endpoints included, using an
// integer error-accumulation (Bresenham-style) walk. Clipping is delegated to
// swrDrawPixel(), which drops pixels outside the framebuffer.
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color)
{
    // A "steep" line covers more rows than columns; it is walked along y so
    // that every step of the major axis produces exactly one pixel.
    const bool steep = swr_absi(x0 - x1) < swr_absi(y0 - y1);

    // Coordinates re-expressed as (major, minor) axis pairs.
    int32_t majorStart = steep ? y0 : x0;
    int32_t minorStart = steep ? x0 : y0;
    int32_t majorEnd = steep ? y1 : x1;
    int32_t minorEnd = steep ? x1 : y1;

    // Always walk in the direction of increasing major coordinate.
    if (majorStart > majorEnd) {
        const int32_t majorTmp = majorStart;
        majorStart = majorEnd;
        majorEnd = majorTmp;
        const int32_t minorTmp = minorStart;
        minorStart = minorEnd;
        minorEnd = minorTmp;
    }

    const int32_t majorLen = majorEnd - majorStart;
    const int32_t errorStep = swr_absi(minorEnd - minorStart) * 2;
    const int32_t minorStep = (minorEnd > minorStart) ? 1 : -1;

    int32_t error = 0;
    int32_t minor = minorStart;
    for (int32_t major = majorStart; major <= majorEnd; ++major) {
        if (steep) {
            // Undo the axis swap when plotting.
            swrDrawPixel(ctx, minor, major, color);
        } else {
            swrDrawPixel(ctx, major, minor, color);
        }

        error += errorStep;
        if (error > majorLen) {
            minor += minorStep;
            error -= majorLen * 2;
        }
    }
}
// Draws the characters in [str, end) with a bitmap font, left to right,
// starting with the glyph's top-left corner at (x0, y0). When `end` is NULL the
// string is read up to its terminating NUL.
//
// Each glyph occupies m_CharHeight consecutive bytes of m_CharData, one byte
// per row; bit `i` of a row byte lights the pixel in column `i`. Characters
// outside [m_CharMin, m_CharMax] are replaced by m_MissingCharFallbackID. The
// pen only advances horizontally, by m_CharWidth per character.
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color)
{
    if (end == NULL) {
        end = str + strlen(str);
    }

    const int32_t glyphWidth = (int32_t)font->m_CharWidth;
    const int32_t glyphHeight = (int32_t)font->m_CharHeight;
    const uint8_t* glyphBits = font->m_CharData;

    int32_t penX = x0;
    for (const char* cursor = str; cursor != end; ++cursor, penX += glyphWidth) {
        char ch = *cursor;
        const bool missing = (ch < font->m_CharMin) || (ch > font->m_CharMax);
        if (missing) {
            ch = font->m_MissingCharFallbackID;
        }

        // Index of the glyph relative to the first character in the font.
        const uint8_t glyphIndex = (uint8_t)ch - font->m_CharMin;
        const uint8_t* glyphRows = glyphBits + glyphIndex * glyphHeight;

        for (int32_t row = 0; row < glyphHeight; ++row) {
            const uint8_t rowBits = glyphRows[row];
            for (int32_t col = 0; col < glyphWidth; ++col) {
                const uint32_t colBit = 1u << col;
                if ((rowBits & colBit) != 0) {
                    swrDrawPixel(ctx, penX + col, y0 + row, color);
                }
            }
        }
    }
}
// Initial .drawTriangle entry of the swr API table. When called it stores the
// rasterizer chosen by the #if chain below into swr->drawTriangle and then
// forwards the current triangle to it, so later calls made through the table
// go straight to the chosen rasterizer.
static void swrDrawTriangleDispatch(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
// TODO: Check CPU caps
// NOTE(review): the choice is made by the preprocessor only. With the first
// branch hard-wired to 1, the SSE4.1-named variant is always installed and no
// runtime CPU feature check happens here. Flip the constants to select another
// variant at build time.
#if 1
swr->drawTriangle = swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2;
#elif 1
swr->drawTriangle = swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2;
#elif 1
swr->drawTriangle = swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2;
#else
swr->drawTriangle = swrDrawTriangleRef_HierarchicalLRB_NoCond;
#endif
// Forward the triangle that triggered the dispatch.
swr->drawTriangle(ctx, x0, y0, x1, y1, x2, y2, color0, color1, color2);
}
//////////////////////////////////////////////////////////////////////////
// swrDrawTriangle() implementations
//
// Edge function of the directed line through (x0, y0) -> (x1, y1), stored in
// incremental form: the function is 0 on the line and changes by m_dx for each
// +1 step in x and by m_dy for each +1 step in y.
typedef struct swr_edge
{
    int32_t m_x0; // x of the point the function is anchored at
    int32_t m_y0; // y of the point the function is anchored at
    int32_t m_dx; // per-pixel step of the function along x (y1 - y0)
    int32_t m_dy; // per-pixel step of the function along y (x0 - x1)
} swr_edge;

// Builds the edge function for the line from (x0, y0) to (x1, y1).
static inline swr_edge swr_edgeInit(int32_t x0, int32_t y0, int32_t x1, int32_t y1)
{
    swr_edge edge;
    edge.m_x0 = x0;
    edge.m_y0 = y0;
    edge.m_dx = y1 - y0;
    edge.m_dy = x0 - x1;
    return edge;
}

// Evaluates the edge function at (x, y). The result is 0 for points on the
// line; its sign tells on which side of the line the point lies.
static inline int32_t swr_edgeEval(swr_edge edge, int32_t x, int32_t y)
{
    const int32_t relX = x - edge.m_x0;
    const int32_t relY = y - edge.m_y0;
    return relX * edge.m_dx + relY * edge.m_dy;
}
// Narrows the inclusive scanline range [*pxmin, *pxmax] (pixel offsets counted
// from the left column of the bounding box) to the offsets at which a single
// edge function is non-negative.
//
// w_row is the edge function value at offset 0 and w_dx its change per pixel,
// so the value at offset i is w_row + i * w_dx:
//   w_row >= 0, w_dx >= 0 : every offset passes, the range is left unchanged
//   w_row >= 0, w_dx <  0 : offsets up to floor(w_row / -w_dx) pass
//   w_row <  0, w_dx >  0 : offsets from ceil(-w_row / w_dx) on pass
//   w_row <  0, w_dx <= 0 : no offset passes, the range is made empty
static inline void swrRefClipRowToEdge(int32_t w_row, int32_t w_dx, int32_t* pxmin, int32_t* pxmax)
{
    if (w_row >= 0) {
        if (w_dx < 0) {
            *pxmax = swr_mini(*pxmax, -(w_row / w_dx));
        }
    } else if (w_dx > 0) {
        const int32_t deficit = -w_row;
        const int32_t firstInside = (deficit / w_dx) + ((deficit % w_dx) != 0 ? 1 : 0);
        *pxmin = swr_maxi(*pxmin, firstInside);
    } else {
        // The edge is negative at the row start and never recovers. This
        // happens when clipping the bounding box to the screen moves its left
        // column past the part of the row covered by the triangle.
        *pxmax = -1;
    }
}

// Reference implementation
// https://fgiesen.wordpress.com/2013/02/08/triangle-rasterization-in-practice/
//
// Rasterizes the triangle (x0, y0), (x1, y1), (x2, y2) into ctx's framebuffer,
// clipped to the framebuffer rectangle. color0..color2 are the packed colors
// of the three vertices and are interpolated across the triangle with
// barycentric weights (unless SWR_CONFIG_NO_PIXEL_SHADER is set, in which case
// covered pixels are written as 0xFFFFFFFF). Either vertex winding is
// accepted; zero-area triangles are skipped.
//
// For every scanline the covered pixel span is computed analytically from the
// three edge functions, so the inner loop contains no coverage test.
//
// NOTE: No fill rule used. All pixels lying ON an edge are drawn.
static void swrDrawTriangleRef(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    }
    if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2), together with the matching colors.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute the triangle bounding box, clipped to the framebuffer.
    const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    if (minX > maxX || minY > maxY) {
        // Entirely off-screen: bail out before forming a framebuffer address
        // from out-of-range coordinates.
        return;
    }
    const int32_t bboxWidth = maxX - minX;
    const int32_t bboxHeight = maxY - minY;

    // Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    // Per-channel differences relative to vertex 2 (see the attribute formula
    // in the pixel loop).
    const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
    const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
    const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
    const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
    const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
    const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
    const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
    const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

    // Triangle setup: edgeN is the edge opposite vertex N, so its value at a
    // pixel is the (unnormalized) barycentric weight of vertex N.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const float inv_area = 1.0f / (float)iarea;
#endif

    // Rasterize, starting from the edge values at the bounding box's top-left pixel.
    int32_t w0_row = swr_edgeEval(edge0, minX, minY);
    int32_t w1_row = swr_edgeEval(edge1, minX, minY);
    int32_t w2_row = swr_edgeEval(edge2, minX, minY);
    uint32_t* fb_row = &ctx->m_FrameBuffer[minX + minY * ctx->m_Width];
    for (int32_t py = 0; py <= bboxHeight; ++py) {
        // Range of x offsets for which all three edge functions are >= 0.
        // A row that is not covered at all ends up with pxmin > pxmax.
        int32_t pxmin = 0;
        int32_t pxmax = bboxWidth;
        swrRefClipRowToEdge(w0_row, edge0.m_dx, &pxmin, &pxmax);
        swrRefClipRowToEdge(w1_row, edge1.m_dx, &pxmin, &pxmax);
        swrRefClipRowToEdge(w2_row, edge2.m_dx, &pxmin, &pxmax);

        // Calculate barycentric coords at pxmin
        int32_t w0 = w0_row + pxmin * edge0.m_dx;
        int32_t w1 = w1_row + pxmin * edge1.m_dx;
        int32_t w2 = w2_row + pxmin * edge2.m_dx;
        for (int32_t px = pxmin; px <= pxmax; ++px) {
            // (px, py) is guaranteed to be inside the triangle (or on one of the edges)
            assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
            const uint32_t rgba = 0xFFFFFFFF;
#else
            const float l0 = (float)w0 * inv_area;
            const float l1 = (float)w1 * inv_area;
            // l2 = 1.0f - (l0 + l1)
            //
            // attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
            // attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
            // attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
            // attr = dattr02 * l0 + dattr12 * l1 + attr2
            const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
            const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
            const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
            const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
            const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
            fb_row[px] = rgba;

            w0 += edge0.m_dx;
            w1 += edge1.m_dx;
            w2 += edge2.m_dx;
        }

        // Step all three edge functions and the framebuffer pointer one row down.
        w0_row += edge0.m_dy;
        w1_row += edge1.m_dy;
        w2_row += edge2.m_dy;
        fb_row += ctx->m_Width;
    }
}
// Edge length, in pixels, of the square blocks walked by the hierarchical
// rasterizers below.
static const uint32_t kBlockSize = 4;
// Selects the bit encoding of the SWR_ROW_MASK / SWR_BLOCK_MASK macros below:
// non-zero sets a bit when the sampled edge value is >= 0; zero (the current
// setting) sets a bit when the sampled edge value is < 0.
#define SWR_CONFIG_USE_POSITIVE_MASKS 0
// Example 4x1 row:
// *---*---*---*---*
// | A | | | B |
// *---*---*---*---*
//
// A = (blockMinX, y)
// B = (blockMaxX, y)
//
// Case | A | B | result
// -----|---|---|----------------------------------
// 00 | - | - | not covered
// 01 | - | + | partially covered, [xmin, blockMaxX]
// 10 | + | - | partially covered, [blockMinX, xmax]
// 11 | + | + | fully covered
// Bit layout of a row mask: sample A (left end of the row) is bit 1, sample B
// (right end of the row) is bit 0.
#define SWR_ROW_MASK_A_Pos 1
#define SWR_ROW_MASK_A_Msk (0x01 << SWR_ROW_MASK_A_Pos)
#define SWR_ROW_MASK_B_Pos 0
#define SWR_ROW_MASK_B_Msk (0x01 << SWR_ROW_MASK_B_Pos)
#if SWR_CONFIG_USE_POSITIVE_MASKS
// Positive encoding: a bit is set when the corresponding sample is >= 0, so
// FULL has both bits set and EMPTY is 0.
#define SWR_ROW_MASK(wA, wB) (0 \
| ((wA) >= 0 ? SWR_ROW_MASK_A_Msk : 0x00) \
| ((wB) >= 0 ? SWR_ROW_MASK_B_Msk : 0x00) \
)
#define SWR_ROW_MASK_EMPTY SWR_ROW_MASK(-1, -1)
#define SWR_ROW_MASK_FULL SWR_ROW_MASK(1, 1)
// X_MIN: sample A is negative, so the row needs a left bound (xmin).
// X_MAX: sample B is negative, so the row needs a right bound (xmax).
#define SWR_ROW_MASK_X_MIN(msk) (((msk) & SWR_ROW_MASK_A_Msk) == 0)
#define SWR_ROW_MASK_X_MAX(msk) (((msk) & SWR_ROW_MASK_B_Msk) == 0)
// ANY_EMPTY: at least one of the three edge masks rejects the whole row.
// ALL_FULL: all three edge masks accept both row ends.
#define SWR_ROW_MASK_ANY_EMPTY(msk0, msk1, msk2) ((((msk0) == SWR_ROW_MASK_EMPTY) || ((msk1) == SWR_ROW_MASK_EMPTY) || ((msk2) == SWR_ROW_MASK_EMPTY)))
#define SWR_ROW_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) & (msk1) & (msk2)) == SWR_ROW_MASK_FULL)
#else
// Negative encoding: a bit is set when the corresponding sample is < 0, so
// FULL is 0 and EMPTY has both bits set.
#define SWR_ROW_MASK(wA, wB) (0 \
| ((wA) < 0 ? SWR_ROW_MASK_A_Msk : 0x00) \
| ((wB) < 0 ? SWR_ROW_MASK_B_Msk : 0x00) \
)
#define SWR_ROW_MASK_EMPTY SWR_ROW_MASK(-1, -1)
#define SWR_ROW_MASK_FULL SWR_ROW_MASK(1, 1)
// X_MIN: sample A is negative, so the row needs a left bound (xmin).
// X_MAX: sample B is negative, so the row needs a right bound (xmax).
#define SWR_ROW_MASK_X_MIN(msk) (((msk) & SWR_ROW_MASK_A_Msk) == SWR_ROW_MASK_A_Msk)
#define SWR_ROW_MASK_X_MAX(msk) (((msk) & SWR_ROW_MASK_B_Msk) == SWR_ROW_MASK_B_Msk)
// ANY_EMPTY: at least one of the three edge masks rejects the whole row.
// ALL_FULL: no edge mask has a negative sample (the OR of the masks is 0).
#define SWR_ROW_MASK_ANY_EMPTY(msk0, msk1, msk2) ((((msk0) == SWR_ROW_MASK_EMPTY) || ((msk1) == SWR_ROW_MASK_EMPTY) || ((msk2) == SWR_ROW_MASK_EMPTY)))
#define SWR_ROW_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) | (msk1) | (msk2)) == SWR_ROW_MASK_FULL)
#endif
// Example 4x4 block:
//
// *---*---*---*---*
// | A | | | B |
// *---*---*---*---*
// | | | | |
// *---*---*---*---*
// | | | | |
// *---*---*---*---*
// | D | | | C |
// *---*---*---*---*
//
// A = (blockMinX, blockMinY)
// B = (blockMaxX, blockMinY)
// C = (blockMaxX, blockMaxY)
// D = (blockMinX, blockMaxY)
//
// Case | A | B | C | D | result
// -----|---|---|---|---|----------------------------------
// 0000 | - | - | - | - | not covered
// 0001 | - | - | - | + | partially covered, [ymin, blockMaxY]
// 0010 | - | - | + | - | partially covered, [ymin, blockMaxY]
// 0011 | - | - | + | + | partially covered, [ymin, blockMaxY]
// 0100 | - | + | - | - | partially covered, [blockMinY, ymax]
// 0101 | - | + | - | + | invalid configuration
// 0110 | - | + | + | - | partially covered, [blockMinY, blockMaxY]
// 0111 | - | + | + | + | partially covered, [blockMinY, blockMaxY]
// 1000 | + | - | - | - | partially covered, [blockMinY, ymax]
// 1001 | + | - | - | + | partially covered, [blockMinY, blockMaxY]
// 1010 | + | - | + | - | invalid configuration
// 1011 | + | - | + | + | partially covered, [blockMinY, blockMaxY]
// 1100 | + | + | - | - | partially covered, [blockMinY, ymax]
// 1101 | + | + | - | + | partially covered, [blockMinY, blockMaxY]
// 1110 | + | + | + | - | partially covered, [blockMinY, blockMaxY]
// 1111 | + | + | + | + | fully covered
// Bit layout of a block mask, one bit per block corner: A (top-left) is bit 3,
// B (top-right) bit 2, C (bottom-right) bit 1, D (bottom-left) bit 0.
#define SWR_BLOCK_MASK_A_Pos 3
#define SWR_BLOCK_MASK_A_Msk (0x01 << SWR_BLOCK_MASK_A_Pos)
#define SWR_BLOCK_MASK_B_Pos 2
#define SWR_BLOCK_MASK_B_Msk (0x01 << SWR_BLOCK_MASK_B_Pos)
#define SWR_BLOCK_MASK_C_Pos 1
#define SWR_BLOCK_MASK_C_Msk (0x01 << SWR_BLOCK_MASK_C_Pos)
#define SWR_BLOCK_MASK_D_Pos 0
#define SWR_BLOCK_MASK_D_Msk (0x01 << SWR_BLOCK_MASK_D_Pos)
#if SWR_CONFIG_USE_POSITIVE_MASKS
// Positive encoding: a bit is set when the corresponding corner value is >= 0,
// so FULL has all four bits set and EMPTY is 0.
#define SWR_BLOCK_MASK(wA, wB, wC, wD) (0 \
| ((wA) >= 0 ? SWR_BLOCK_MASK_A_Msk : 0x00) \
| ((wB) >= 0 ? SWR_BLOCK_MASK_B_Msk : 0x00) \
| ((wC) >= 0 ? SWR_BLOCK_MASK_C_Msk : 0x00) \
| ((wD) >= 0 ? SWR_BLOCK_MASK_D_Msk : 0x00) \
)
#define SWR_BLOCK_MASK_EMPTY SWR_BLOCK_MASK(-1, -1, -1, -1)
#define SWR_BLOCK_MASK_FULL SWR_BLOCK_MASK(1, 1, 1, 1)
// IS_VALID: rejects the two configurations in which only diagonally opposite
// corners share a sign (marked "invalid configuration" in the table above).
#define SWR_BLOCK_MASK_IS_VALID(msk) (((msk) != SWR_BLOCK_MASK(-1, 1, -1, 1)) && ((msk) != SWR_BLOCK_MASK(1, -1, 1, -1)))
// Y_MIN: both top corners (A, B) are negative, so the block needs a lower y bound (ymin).
// Y_MAX: both bottom corners (C, D) are negative, so the block needs an upper y bound (ymax).
#define SWR_BLOCK_MASK_Y_MIN(msk) (((msk) & (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk)) == 0)
#define SWR_BLOCK_MASK_Y_MAX(msk) (((msk) & (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk)) == 0)
// ALL_FULL: all three edge masks accept all four corners.
#define SWR_BLOCK_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) & (msk1) & (msk2)) == SWR_BLOCK_MASK_FULL)
#else
// Negative encoding: a bit is set when the corresponding corner value is < 0,
// so FULL is 0 and EMPTY has all four bits set.
#define SWR_BLOCK_MASK(wA, wB, wC, wD) (0 \
| ((wA) < 0 ? SWR_BLOCK_MASK_A_Msk : 0x00) \
| ((wB) < 0 ? SWR_BLOCK_MASK_B_Msk : 0x00) \
| ((wC) < 0 ? SWR_BLOCK_MASK_C_Msk : 0x00) \
| ((wD) < 0 ? SWR_BLOCK_MASK_D_Msk : 0x00) \
)
#define SWR_BLOCK_MASK_EMPTY SWR_BLOCK_MASK(-1, -1, -1, -1)
#define SWR_BLOCK_MASK_FULL SWR_BLOCK_MASK(1, 1, 1, 1)
// IS_VALID: rejects the two configurations in which only diagonally opposite
// corners share a sign (marked "invalid configuration" in the table above).
#define SWR_BLOCK_MASK_IS_VALID(msk) (((msk) != SWR_BLOCK_MASK(-1, 1, -1, 1)) && ((msk) != SWR_BLOCK_MASK(1, -1, 1, -1)))
// Y_MIN: both top corners (A, B) are negative, so the block needs a lower y bound (ymin).
// Y_MAX: both bottom corners (C, D) are negative, so the block needs an upper y bound (ymax).
#define SWR_BLOCK_MASK_Y_MIN(msk) (((msk) & (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk)) == (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk))
#define SWR_BLOCK_MASK_Y_MAX(msk) (((msk) & (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk)) == (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk))
// ALL_FULL: no edge mask has a negative corner (the OR of the masks is 0).
#define SWR_BLOCK_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) | (msk1) | (msk2)) == SWR_BLOCK_MASK_FULL)
#endif
// Block-based variant of swrDrawTriangleRef.
//
// The clipped bounding box is walked in kBlockSize x kBlockSize blocks. For
// every block each of the three edge functions is evaluated at the four block
// corners and folded into a SWR_BLOCK_MASK:
//   - if any edge is negative at all four corners the block is skipped,
//   - if all edges are non-negative at all corners the block is filled
//     without any per-row or per-pixel range computation,
//   - otherwise the block is scanned row by row, computing the covered
//     [pxmin, pxmax] span of each row from the edge values at the row ends.
//
// Parameters are the same as for the other triangle rasterizers: three
// vertices in pixel coordinates and one packed color per vertex. Either
// winding is accepted; zero-area triangles are skipped.
static void swrDrawTriangleRef_Hierarchical(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
if (iarea == 0) {
// Degenerate triangle with 0 area.
return;
} else if (iarea < 0) {
// Swap (x1, y1) <-> (x2, y2)
{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
iarea = -iarea;
}
// Compute triangle bounding box
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
// Block-aligned bounding box. swr_alignDown/swr_alignUp are defined outside
// this view; presumably they round to a multiple of kBlockSize.
const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);
const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, kBlockSize);
const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, kBlockSize);
// NOTE(review): bboxWidth and bboxHeight are not read anywhere in this function.
const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned;
const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned;
// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
// Per-channel differences relative to vertex 2, used by the attribute formula
// in the pixel loops.
const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif
// Triangle setup
// edgeN is the edge opposite vertex N; its value at a pixel is the
// unnormalized barycentric weight of vertex N.
const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);
// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
const float inv_area = 1.0f / (float)iarea;
#endif
// Rasterize
// NOTE(review): both block loops compare against the unaligned bboxMax with a
// strict '<', so a block whose first row/column equals bboxMaxY/bboxMaxX is not
// visited - confirm that skipping it is intended.
// NOTE(review): blocks are not clamped to the clipped bounding box; the pixel
// loops below can address columns/rows up to blockMin + kBlockSize - 1. This
// presumably relies on framebuffer dimensions being multiples of kBlockSize -
// verify against the callers.
for (int32_t blockMinY = bboxMinY_aligned, blockMaxY = bboxMinY_aligned + kBlockSize - 1;
blockMinY < bboxMaxY;
blockMinY += kBlockSize, blockMaxY += kBlockSize) {
for (int32_t blockMinX = bboxMinX_aligned, blockMaxX = bboxMinX_aligned + kBlockSize - 1;
blockMinX < bboxMaxX;
blockMinX += kBlockSize, blockMaxX += kBlockSize) {
// Evaluate 1st edge function at the 4 block corners. If all of the signed
// distances are negative (all sign bits are 1) then the block will be empty.
const int32_t w0_A = swr_edgeEval(edge0, blockMinX, blockMinY);
const int32_t w0_B = swr_edgeEval(edge0, blockMaxX, blockMinY);
const int32_t w0_C = swr_edgeEval(edge0, blockMaxX, blockMaxY);
const int32_t w0_D = swr_edgeEval(edge0, blockMinX, blockMaxY);
const uint32_t w0_blockMsk = SWR_BLOCK_MASK(w0_A, w0_B, w0_C, w0_D);
assert(SWR_BLOCK_MASK_IS_VALID(w0_blockMsk));
if (w0_blockMsk == SWR_BLOCK_MASK_EMPTY) {
continue;
}
// Evaluate 2nd edge function at the 4 block corners. If all of the signed
// distances are negative (all sign bits are 1) then the block will be empty.
const int32_t w1_A = swr_edgeEval(edge1, blockMinX, blockMinY);
const int32_t w1_B = swr_edgeEval(edge1, blockMaxX, blockMinY);
const int32_t w1_C = swr_edgeEval(edge1, blockMaxX, blockMaxY);
const int32_t w1_D = swr_edgeEval(edge1, blockMinX, blockMaxY);
const uint32_t w1_blockMsk = SWR_BLOCK_MASK(w1_A, w1_B, w1_C, w1_D);
assert(SWR_BLOCK_MASK_IS_VALID(w1_blockMsk));
if (w1_blockMsk == SWR_BLOCK_MASK_EMPTY) {
continue;
}
// Evaluate 3rd edge function at the 4 block corners. If all of the signed
// distances are negative (all sign bits are 1) then the block will be empty.
const int32_t w2_A = swr_edgeEval(edge2, blockMinX, blockMinY);
const int32_t w2_B = swr_edgeEval(edge2, blockMaxX, blockMinY);
const int32_t w2_C = swr_edgeEval(edge2, blockMaxX, blockMaxY);
const int32_t w2_D = swr_edgeEval(edge2, blockMinX, blockMaxY);
const uint32_t w2_blockMsk = SWR_BLOCK_MASK(w2_A, w2_B, w2_C, w2_D);
assert(SWR_BLOCK_MASK_IS_VALID(w2_blockMsk));
if (w2_blockMsk == SWR_BLOCK_MASK_EMPTY) {
continue;
}
if (!SWR_BLOCK_MASK_ALL_FULL(w0_blockMsk, w1_blockMsk, w2_blockMsk)) {
// Partial block
// Narrow the row range [pymin, pymax] (offsets inside the block) per edge:
// an edge that is negative at both bottom corners limits pymax, one that is
// negative at both top corners limits pymin. swr_idiv_floor/swr_idiv_ceil are
// defined outside this view; presumably floor/ceil integer division.
int32_t pymin = 0;
int32_t pymax = kBlockSize - 1;
{
if (SWR_BLOCK_MASK_Y_MAX(w0_blockMsk)) {
const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
pymax = swr_mini(pymax, w_pymax);
} else if (SWR_BLOCK_MASK_Y_MIN(w0_blockMsk)) {
const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
pymin = swr_maxi(pymin, w_pymin);
}
if (SWR_BLOCK_MASK_Y_MAX(w1_blockMsk)) {
const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
pymax = swr_mini(pymax, w_pymax);
} else if (SWR_BLOCK_MASK_Y_MIN(w1_blockMsk)) {
const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
pymin = swr_maxi(pymin, w_pymin);
}
if (SWR_BLOCK_MASK_Y_MAX(w2_blockMsk)) {
const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
pymax = swr_mini(pymax, w_pymax);
} else if (SWR_BLOCK_MASK_Y_MIN(w2_blockMsk)) {
const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
pymin = swr_maxi(pymin, w_pymin);
}
}
// Evaluate edge functions at the first row.
// The *_blockMaxX_py values are the same functions at the row's right end,
// i.e. (kBlockSize - 1) x-steps further.
int32_t w0_blockMinX_py = swr_edgeEval(edge0, blockMinX, blockMinY + pymin);
int32_t w1_blockMinX_py = swr_edgeEval(edge1, blockMinX, blockMinY + pymin);
int32_t w2_blockMinX_py = swr_edgeEval(edge2, blockMinX, blockMinY + pymin);
int32_t w0_blockMaxX_py = w0_blockMinX_py + edge0.m_dx * (kBlockSize - 1);
int32_t w1_blockMaxX_py = w1_blockMinX_py + edge1.m_dx * (kBlockSize - 1);
int32_t w2_blockMaxX_py = w2_blockMinX_py + edge2.m_dx * (kBlockSize - 1);
for (int32_t py = pymin; py <= pymax; ++py) {
// Classify the row per edge from the signs at its two ends.
const int32_t w0_rowMsk = SWR_ROW_MASK(w0_blockMinX_py, w0_blockMaxX_py);
const int32_t w1_rowMsk = SWR_ROW_MASK(w1_blockMinX_py, w1_blockMaxX_py);
const int32_t w2_rowMsk = SWR_ROW_MASK(w2_blockMinX_py, w2_blockMaxX_py);
// Rows inside [pymin, pymax] are expected to be at least partially covered
// by every edge; this is only checked in builds with assertions enabled.
assert(w0_rowMsk != SWR_ROW_MASK_EMPTY);
assert(w1_rowMsk != SWR_ROW_MASK_EMPTY);
assert(w2_rowMsk != SWR_ROW_MASK_EMPTY);
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + (blockMinY + py) * ctx->m_Width];
int32_t pxmin = 0;
int32_t pxmax = (int32_t)kBlockSize - 1;
if (!SWR_ROW_MASK_ALL_FULL(w0_rowMsk, w1_rowMsk, w2_rowMsk)) {
// An edge that is negative at the row's right end limits pxmax; one that is
// negative at the left end limits pxmin.
if (SWR_ROW_MASK_X_MAX(w0_rowMsk)) {
const int32_t w_pxmax = swr_idiv_floor(w0_blockMinX_py, -edge0.m_dx);
pxmax = swr_mini(pxmax, w_pxmax);
} else if (SWR_ROW_MASK_X_MIN(w0_rowMsk)) {
const int32_t w_pxmin = swr_idiv_ceil(-w0_blockMinX_py, edge0.m_dx);
pxmin = swr_maxi(pxmin, w_pxmin);
}
if (SWR_ROW_MASK_X_MAX(w1_rowMsk)) {
const int32_t w_pxmax = swr_idiv_floor(w1_blockMinX_py, -edge1.m_dx);
pxmax = swr_mini(pxmax, w_pxmax);
} else if (SWR_ROW_MASK_X_MIN(w1_rowMsk)) {
const int32_t w_pxmin = swr_idiv_ceil(-w1_blockMinX_py, edge1.m_dx);
pxmin = swr_maxi(pxmin, w_pxmin);
}
if (SWR_ROW_MASK_X_MAX(w2_rowMsk)) {
const int32_t w_pxmax = swr_idiv_floor(w2_blockMinX_py, -edge2.m_dx);
pxmax = swr_mini(pxmax, w_pxmax);
} else if (SWR_ROW_MASK_X_MIN(w2_rowMsk)) {
const int32_t w_pxmin = swr_idiv_ceil(-w2_blockMinX_py, edge2.m_dx);
pxmin = swr_maxi(pxmin, w_pxmin);
}
}
// Calculate barycentric coords at pxmin
int32_t w0 = w0_blockMinX_py + pxmin * edge0.m_dx;
int32_t w1 = w1_blockMinX_py + pxmin * edge1.m_dx;
int32_t w2 = w2_blockMinX_py + pxmin * edge2.m_dx;
for (int32_t px = pxmin; px <= pxmax; ++px) {
// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
// Render the pixel
{
assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
const uint32_t rgba = 0xFFFFFFFF;
#else
const float l0 = (float)w0 * inv_area;
const float l1 = (float)w1 * inv_area;
// l2 = 1.0f - (l0 + l1)
//
// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
//
// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
fb_row[px] = rgba;
}
w0 += edge0.m_dx;
w1 += edge1.m_dx;
w2 += edge2.m_dx;
}
// Step both row-end samples of every edge one row down.
w0_blockMinX_py += edge0.m_dy;
w1_blockMinX_py += edge1.m_dy;
w2_blockMinX_py += edge2.m_dy;
w0_blockMaxX_py += edge0.m_dy;
w1_blockMaxX_py += edge1.m_dy;
w2_blockMaxX_py += edge2.m_dy;
}
} else {
// Full block
// All corners pass all edges, so every pixel of the block is written
// without computing a per-row span. Start from the values at corner A.
int32_t w0_row = w0_A;
int32_t w1_row = w1_A;
int32_t w2_row = w2_A;
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
// Calculate barycentric coords at pxmin
int32_t w0 = w0_row;
int32_t w1 = w1_row;
int32_t w2 = w2_row;
for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
// Render the pixel
{
assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
const uint32_t rgba = 0xFFFFFFFF;
#else
const float l0 = (float)w0 * inv_area;
const float l1 = (float)w1 * inv_area;
// l2 = 1.0f - (l0 + l1)
//
// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
//
// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
fb_row[px] = rgba;
}
w0 += edge0.m_dx;
w1 += edge1.m_dx;
w2 += edge2.m_dx;
}
w0_row += edge0.m_dy;
w1_row += edge1.m_dy;
w2_row += edge2.m_dy;
fb_row += ctx->m_Width;
}
}
}
}
}
// True when at least one of the three values is negative: OR-ing them carries
// any set sign bit into the result, which is then compared against 0.
// Assumes the arguments are signed integers - TODO confirm at the call sites.
#define SWR_ANY_NEGATIVE3(a, b, c) (((a) | (b) | (c)) < 0)
// 2-level hierarchical rasterization using trivial reject/accept corners.
//
// The screen-clamped bounding box of the triangle is walked in kBlockSize x kBlockSize
// blocks, starting from the bounding box min corner snapped down to the block grid.
// Each block is classified by evaluating the three edge functions at two block corners:
//  - trivial reject corner: the corner where the edge function is largest. If it is
//    negative for any edge, the block is skipped.
//  - trivial accept corner: the opposite corner, where the edge function is smallest.
//    If it is non-negative for all edges, the block is fully covered.
//
// Fully covered blocks are rasterized without any conditionals in the inner loops.
//
// Partially covered blocks are rasterized conditionally by keeping track of the
// edge function values at each block row's min. Only completely uncovered rows
// are skipped.
//
// A pixel is written when all three edge functions are >= 0 at that pixel.
//
// Parameters:
//  ctx              Render target. Pixels are written to ctx->m_FrameBuffer using
//                   ctx->m_Width as the row pitch.
//  (x0,y0)..(x2,y2) Triangle vertices in pixel coordinates. Either winding is accepted;
//                   zero-area triangles are ignored.
//  color0..color2   Per-vertex packed colors, interpolated with barycentric weights.
//
// NOTE(review): blocks are always processed as whole kBlockSize x kBlockSize tiles, so this
// appears to rely on the framebuffer dimensions being multiples of kBlockSize. Confirm
// against swrCreateContext().
static void swrDrawTriangleRef_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute triangle bounding box. Both min and max are inclusive pixel coordinates,
    // clamped to the framebuffer rectangle.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // The triangle lies completely outside the framebuffer.
        return;
    }

    // Snap the min corner to the block grid. The block loops below run while the block's
    // min coordinate is <= the inclusive bounding box max.
    const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
    const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

    // Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    // Channel deltas relative to vertex 2 (see the interpolation formula in the pixel loops).
    const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
    const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
    const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
    const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
    const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
    const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
    const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
    const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

    // Triangle setup. edgeN is the edge opposite to vertex N.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const float inv_area = 1.0f / (float)iarea;
#endif

    // Trivial reject/accept corner offsets relative to block min/max.
    const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
    const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
    const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
    const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
    const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
    const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);

    // Offset from the block min corner to the corner where each edge function is largest.
    const int32_t trivialRejectOffset0 = 0
        + (edge0.m_dx >= 0 ? w0_blockMax_dx : 0)
        + (edge0.m_dy >= 0 ? w0_blockMax_dy : 0)
        ;
    const int32_t trivialRejectOffset1 = 0
        + (edge1.m_dx >= 0 ? w1_blockMax_dx : 0)
        + (edge1.m_dy >= 0 ? w1_blockMax_dy : 0)
        ;
    const int32_t trivialRejectOffset2 = 0
        + (edge2.m_dx >= 0 ? w2_blockMax_dx : 0)
        + (edge2.m_dy >= 0 ? w2_blockMax_dy : 0)
        ;

    // Offset from the block min corner to the opposite corner (edge function is smallest).
    const int32_t trivialAcceptOffset0 = (w0_blockMax_dx + w0_blockMax_dy) - trivialRejectOffset0;
    const int32_t trivialAcceptOffset1 = (w1_blockMax_dx + w1_blockMax_dy) - trivialRejectOffset1;
    const int32_t trivialAcceptOffset2 = (w2_blockMax_dx + w2_blockMax_dy) - trivialRejectOffset2;

    // Horizontal-only reject offsets, used to test a single pixel row of a block.
    const int32_t trivialRejectOffset0_dx = 0
        + (edge0.m_dx >= 0 ? w0_blockMax_dx : 0)
        ;
    const int32_t trivialRejectOffset1_dx = 0
        + (edge1.m_dx >= 0 ? w1_blockMax_dx : 0)
        ;
    const int32_t trivialRejectOffset2_dx = 0
        + (edge2.m_dx >= 0 ? w2_blockMax_dx : 0)
        ;

    // Rasterize
    const int32_t w0_bboxMin = swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned);
    const int32_t w1_bboxMin = swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned);
    const int32_t w2_bboxMin = swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned);

    // Edge function increments when stepping one whole block right / down.
    const int32_t w0_nextBlock_dx = edge0.m_dx * kBlockSize;
    const int32_t w0_nextBlock_dy = edge0.m_dy * kBlockSize;
    const int32_t w1_nextBlock_dx = edge1.m_dx * kBlockSize;
    const int32_t w1_nextBlock_dy = edge1.m_dy * kBlockSize;
    const int32_t w2_nextBlock_dx = edge2.m_dx * kBlockSize;
    const int32_t w2_nextBlock_dy = edge2.m_dy * kBlockSize;

    int32_t w0_blockY = w0_bboxMin;
    int32_t w1_blockY = w1_bboxMin;
    int32_t w2_blockY = w2_bboxMin;
    // bboxMaxY/bboxMaxX are inclusive, so a block starting exactly at the max coordinate
    // still contains one row/column of candidate pixels and must be visited (<=, not <).
    for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
        int32_t w0_blockMin = w0_blockY;
        int32_t w1_blockMin = w1_blockY;
        int32_t w2_blockMin = w2_blockY;
        for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
            // Evaluate each edge function at its trivial reject corner (the most positive block corner).
            // If the trivial reject corner of any edge is negative (outside the edge) then the triangle
            // does not touch the block.
            const int32_t w0_trivialReject = w0_blockMin + trivialRejectOffset0;
            const int32_t w1_trivialReject = w1_blockMin + trivialRejectOffset1;
            const int32_t w2_trivialReject = w2_blockMin + trivialRejectOffset2;
            if (SWR_ANY_NEGATIVE3(w0_trivialReject, w1_trivialReject, w2_trivialReject)) {
                w0_blockMin += w0_nextBlock_dx;
                w1_blockMin += w1_nextBlock_dx;
                w2_blockMin += w2_nextBlock_dx;
                continue;
            }

            // At this point we know that the triangle touches the tile. There are 2 cases:
            // - The tile is fully covered by the triangle.
            // - The tile is partially covered by the triangle.
            //
            // In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
            // In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
            //
            // Evaluate each edge function at its trivial accept corner (the most negative block corner).
            // The trivial accept corner is the opposite corner to the trivial reject corner.
            // If all trivial accept corners are inside their respective edges then the block is fully
            // covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
            //
            // The trivial accept corner is calculated by subtracting the trivial reject corner offset from
            // the block's max point.
            // E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
            // was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
            // (trivial accept) corner:
            //   trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
            //   trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
            //
            const int32_t w0_trivialAccept = w0_blockMin + trivialAcceptOffset0;
            const int32_t w1_trivialAccept = w1_blockMin + trivialAcceptOffset1;
            const int32_t w2_trivialAccept = w2_blockMin + trivialAcceptOffset2;
            if (SWR_ANY_NEGATIVE3(w0_trivialAccept, w1_trivialAccept, w2_trivialAccept)) {
                // Partial block
                int32_t w0_row = w0_blockMin;
                int32_t w1_row = w1_blockMin;
                int32_t w2_row = w2_blockMin;
                uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
                for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
                    // Row-level trivial reject: skip the row if the largest value of any edge
                    // function along this row is still negative.
                    const int32_t w0_rowTrivialReject = w0_row + trivialRejectOffset0_dx;
                    const int32_t w1_rowTrivialReject = w1_row + trivialRejectOffset1_dx;
                    const int32_t w2_rowTrivialReject = w2_row + trivialRejectOffset2_dx;
                    if (!SWR_ANY_NEGATIVE3(w0_rowTrivialReject, w1_rowTrivialReject, w2_rowTrivialReject)) {
                        int32_t w0 = w0_row;
                        int32_t w1 = w1_row;
                        int32_t w2 = w2_row;
                        for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
                            if (!SWR_ANY_NEGATIVE3(w0, w1, w2)) {
                                assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
                                const uint32_t rgba = 0xFFFFFFFF;
#else
                                const float l0 = (float)w0 * inv_area;
                                const float l1 = (float)w1 * inv_area;
                                // l2 = 1.0f - (l0 + l1)
                                //
                                // attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
                                // attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
                                // attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
                                // attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
                                //
                                // attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
                                const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
                                const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
                                const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
                                const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
                                const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
                                fb_row[px] = rgba;
                            }
                            w0 += edge0.m_dx;
                            w1 += edge1.m_dx;
                            w2 += edge2.m_dx;
                        }
                    }
                    w0_row += edge0.m_dy;
                    w1_row += edge1.m_dy;
                    w2_row += edge2.m_dy;
                    fb_row += ctx->m_Width;
                }
            } else {
                // Full block
                int32_t w0_row = w0_blockMin;
                int32_t w1_row = w1_blockMin;
                int32_t w2_row = w2_blockMin;
                uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
                for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
                    // Calculate barycentric coords at pxmin
                    int32_t w0 = w0_row;
                    int32_t w1 = w1_row;
                    int32_t w2 = w2_row;
                    for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
                        // (px, py) is guaranteed to be inside the triangle (or on one of the edges)
                        // Render the pixel
                        {
                            assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
                            const uint32_t rgba = 0xFFFFFFFF;
#else
                            const float l0 = (float)w0 * inv_area;
                            const float l1 = (float)w1 * inv_area;
                            // Same interpolation as in the partial block path:
                            // attr = dattr02 * l0 + dattr12 * l1 + attr2
                            const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
                            const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
                            const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
                            const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
                            const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
                            fb_row[px] = rgba;
                        }
                        w0 += edge0.m_dx;
                        w1 += edge1.m_dx;
                        w2 += edge2.m_dx;
                    }
                    w0_row += edge0.m_dy;
                    w1_row += edge1.m_dy;
                    w2_row += edge2.m_dy;
                    fb_row += ctx->m_Width;
                }
            }

            w0_blockMin += w0_nextBlock_dx;
            w1_blockMin += w1_nextBlock_dx;
            w2_blockMin += w2_nextBlock_dx;
        }
        w0_blockY += w0_nextBlock_dy;
        w1_blockY += w1_nextBlock_dy;
        w2_blockY += w2_nextBlock_dy;
    }
}
// SSE2 flavour of the 2-level hierarchical rasterizer with trivial reject/accept corners.
//
// The screen-clamped bounding box of the triangle is walked in kBlockSize x kBlockSize
// blocks. The three edge function values of a block/row are kept together in one vec4i
// (lanes x/y/z = edge0/edge1/edge2, lane w is always built from 0), so the trivial
// reject/accept tests are a single vector add followed by vec4i_any_neg_SSE2().
// Inside a block, pixels are processed 4 at a time along x, so the px loops step by 4.
//
// Fully covered blocks are written unconditionally. Partially covered blocks skip rows
// that fail the row-level reject test and use a per-pixel mask for the store.
//
// Parameters:
//  ctx              Render target. Pixels are written to ctx->m_FrameBuffer using
//                   ctx->m_Width as the row pitch.
//  (x0,y0)..(x2,y2) Triangle vertices in pixel coordinates. Either winding is accepted;
//                   zero-area triangles are ignored.
//  color0..color2   Per-vertex packed colors, interpolated with barycentric weights.
//
// NOTE(review): blocks are always processed as whole kBlockSize x kBlockSize tiles and
// stored 4 pixels at a time, so this appears to rely on the framebuffer dimensions being
// multiples of kBlockSize (and kBlockSize being a multiple of 4). Confirm against
// swrCreateContext() and the alignment requirements of vec4i_toInt4va().
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute triangle bounding box. Both min and max are inclusive pixel coordinates,
    // clamped to the framebuffer rectangle.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // The triangle lies completely outside the framebuffer.
        return;
    }

    // Snap the min corner to the block grid. The block loops below run while the block's
    // min coordinate is <= the inclusive bounding box max.
    const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
    const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

    // Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const vec4f v_c0 = vec4f_fromRGBA8(color0);
    const vec4f v_c1 = vec4f_fromRGBA8(color1);
    const vec4f v_c2 = vec4f_fromRGBA8(color2);
    const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
    const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
    // Broadcast every channel of color2 and of the two color deltas to all 4 lanes,
    // so each channel can be interpolated for 4 pixels at once.
    const vec4f v_r2 = vec4f_getXXXX(v_c2);
    const vec4f v_g2 = vec4f_getYYYY(v_c2);
    const vec4f v_b2 = vec4f_getZZZZ(v_c2);
    const vec4f v_a2 = vec4f_getWWWW(v_c2);
    const vec4f v_dr02 = vec4f_getXXXX(v_c02);
    const vec4f v_dg02 = vec4f_getYYYY(v_c02);
    const vec4f v_db02 = vec4f_getZZZZ(v_c02);
    const vec4f v_da02 = vec4f_getWWWW(v_c02);
    const vec4f v_dr12 = vec4f_getXXXX(v_c12);
    const vec4f v_dg12 = vec4f_getYYYY(v_c12);
    const vec4f v_db12 = vec4f_getZZZZ(v_c12);
    const vec4f v_da12 = vec4f_getWWWW(v_c12);
#endif

    // Triangle setup. edgeN is the edge opposite to vertex N.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

    // Trivial reject/accept corner offsets relative to block min/max.
    const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
    const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
    const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
    const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
    const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
    const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);

    // Per-edge offset from the block min corner to the corner where the edge function is largest.
    const vec4i v_trivialRejectOffset = vec4i_fromInt4(
        (edge0.m_dx >= 0 ? w0_blockMax_dx : 0) + (edge0.m_dy >= 0 ? w0_blockMax_dy : 0),
        (edge1.m_dx >= 0 ? w1_blockMax_dx : 0) + (edge1.m_dy >= 0 ? w1_blockMax_dy : 0),
        (edge2.m_dx >= 0 ? w2_blockMax_dx : 0) + (edge2.m_dy >= 0 ? w2_blockMax_dy : 0),
        0
    );
    // Per-edge offset to the opposite corner (edge function is smallest).
    const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_fromInt4(
        (w0_blockMax_dx + w0_blockMax_dy),
        (w1_blockMax_dx + w1_blockMax_dy),
        (w2_blockMax_dx + w2_blockMax_dy),
        0), v_trivialRejectOffset
    );
    // Horizontal-only reject offsets, used to test a single pixel row of a block.
    const vec4i v_trivialRejectOffset_dx = vec4i_fromInt4(
        (edge0.m_dx >= 0 ? w0_blockMax_dx : 0),
        (edge1.m_dx >= 0 ? w1_blockMax_dx : 0),
        (edge2.m_dx >= 0 ? w2_blockMax_dx : 0),
        0
    );

    // Edge function offsets of 4 consecutive pixels relative to the first one...
    const vec4i v_edge0_dx0123 = vec4i_fromInt4(0, edge0.m_dx, edge0.m_dx * 2, edge0.m_dx * 3);
    const vec4i v_edge1_dx0123 = vec4i_fromInt4(0, edge1.m_dx, edge1.m_dx * 2, edge1.m_dx * 3);
    const vec4i v_edge2_dx0123 = vec4i_fromInt4(0, edge2.m_dx, edge2.m_dx * 2, edge2.m_dx * 3);
    // ...and the increment needed to advance to the next group of 4 pixels.
    const vec4i v_edge0_dx4 = vec4i_fromInt(edge0.m_dx * 4);
    const vec4i v_edge1_dx4 = vec4i_fromInt(edge1.m_dx * 4);
    const vec4i v_edge2_dx4 = vec4i_fromInt(edge2.m_dx * 4);
    // Increment of the packed (edge0, edge1, edge2) row values when moving one row down.
    const vec4i v_edge012__dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

    // Rasterize
    const vec4i v_w_bboxMin = vec4i_fromInt4(
        swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned),
        swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned),
        swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned),
        0
    );
    // Edge function increments when stepping one whole block right / down.
    const vec4i v_w_nextBlock_dx = vec4i_fromInt4(edge0.m_dx * kBlockSize, edge1.m_dx * kBlockSize, edge2.m_dx * kBlockSize, 0);
    const vec4i v_w_nextBlock_dy = vec4i_fromInt4(edge0.m_dy * kBlockSize, edge1.m_dy * kBlockSize, edge2.m_dy * kBlockSize, 0);

    vec4i v_w_blockY = v_w_bboxMin;
    // bboxMaxY/bboxMaxX are inclusive, so a block starting exactly at the max coordinate
    // still contains one row/column of candidate pixels and must be visited (<=, not <).
    for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
        vec4i v_w_blockMin = v_w_blockY;
        for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
            // Evaluate each edge function at its trivial reject corner (the most positive block corner).
            // If the trivial reject corner of any edge is negative (outside the edge) then the triangle
            // does not touch the block.
            const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset);
            if (vec4i_any_neg_SSE2(v_w_trivialReject)) {
                v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
                continue;
            }

            // At this point we know that the triangle touches the tile. There are 2 cases:
            // - The tile is fully covered by the triangle.
            // - The tile is partially covered by the triangle.
            //
            // In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
            // In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
            //
            // Evaluate each edge function at its trivial accept corner (the most negative block corner).
            // The trivial accept corner is the opposite corner to the trivial reject corner.
            // If all trivial accept corners are inside their respective edges then the block is fully
            // covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
            //
            // The trivial accept corner is calculated by subtracting the trivial reject corner offset from
            // the block's max point.
            // E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
            // was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
            // (trivial accept) corner:
            //   trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
            //   trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
            //
            const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset);
            if (vec4i_any_neg_SSE2(v_w_trivialAccept)) {
                // Partial block
                vec4i v_w_row = v_w_blockMin;
                uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
                for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
                    // Row-level trivial reject: skip the row if the largest value of any edge
                    // function along this row is still negative.
                    const vec4i v_w_rowTrivialReject = vec4i_add(v_w_row, v_trivialRejectOffset_dx);
                    if (!vec4i_any_neg_SSE2(v_w_rowTrivialReject)) {
                        // Edge function values of the first 4 pixels of the row.
                        vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
                        vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
                        vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);
                        for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
                            // Calculate the (inverse) pixel mask.
                            // If any of the barycentric coordinates is negative, the pixel mask will
                            // be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
                            // to blend between the existing framebuffer values and the new values.
                            const vec4i v_izero = vec4i_zero();
                            const vec4i v_w0_lt = vec4i_cmplt(v_w0, v_izero);
                            const vec4i v_w1_lt = vec4i_cmplt(v_w1, v_izero);
                            const vec4i v_w2_lt = vec4i_cmplt(v_w2, v_izero);
                            const vec4i v_notPixelMask = vec4i_or(v_w0_lt, vec4i_or(v_w1_lt, v_w2_lt));
                            {
#if SWR_CONFIG_NO_PIXEL_SHADER
                                const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                                const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
                                const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);
                                // l2 = 1.0f - (l0 + l1)
                                //
                                // attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
                                // attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
                                // attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
                                // attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
                                //
                                // attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
                                const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                                const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                                const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                                const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                                // Pack independent R32/G32/B32/A32 values of the 4 pixels into RGBA8.
                                const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
                                // Store result using the pixel mask
                                vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[px]);
                            }
                            v_w0 = vec4i_add(v_w0, v_edge0_dx4);
                            v_w1 = vec4i_add(v_w1, v_edge1_dx4);
                            v_w2 = vec4i_add(v_w2, v_edge2_dx4);
                        }
                    }
                    v_w_row = vec4i_add(v_w_row, v_edge012__dy);
                    fb_row += ctx->m_Width;
                }
            } else {
                // Full block
                vec4i v_w_row = v_w_blockMin;
                uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
                for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
                    // Calculate barycentric coords at pxmin
                    vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
                    vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
                    vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);
                    for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
                        // (px, py) is guaranteed to be inside the triangle (or on one of the edges)
                        // Render the pixel
                        {
#if SWR_CONFIG_NO_PIXEL_SHADER
                            const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                            const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
                            const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);
                            // Same interpolation as in the partial block path:
                            // attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
                            const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                            const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                            const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                            const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                            const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
                            // Every pixel of a full block is covered, so no mask is needed.
                            vec4i_toInt4va(v_rgba8, &fb_row[px]);
                        }
                        v_w0 = vec4i_add(v_w0, v_edge0_dx4);
                        v_w1 = vec4i_add(v_w1, v_edge1_dx4);
                        v_w2 = vec4i_add(v_w2, v_edge2_dx4);
                    }
                    v_w_row = vec4i_add(v_w_row, v_edge012__dy);
                    fb_row += ctx->m_Width;
                }
            }

            v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
        }
        v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy);
    }
}
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
if (iarea == 0) {
// Degenerate triangle with 0 area.
return;
} else if (iarea < 0) {
// Swap (x1, y1) <-> (x2, y2)
{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
iarea = -iarea;
}
// Compute triangle bounding box
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 4);
const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 4);
const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);
const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned;
const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned;
// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
const vec4f v_c0 = vec4f_fromRGBA8(color0);
const vec4f v_c1 = vec4f_fromRGBA8(color1);
const vec4f v_c2 = vec4f_fromRGBA8(color2);
const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
const vec4f v_r2 = vec4f_getXXXX(v_c2);
const vec4f v_g2 = vec4f_getYYYY(v_c2);
const vec4f v_b2 = vec4f_getZZZZ(v_c2);
const vec4f v_a2 = vec4f_getWWWW(v_c2);
const vec4f v_dr02 = vec4f_getXXXX(v_c02);
const vec4f v_dg02 = vec4f_getYYYY(v_c02);
const vec4f v_db02 = vec4f_getZZZZ(v_c02);
const vec4f v_da02 = vec4f_getWWWW(v_c02);
const vec4f v_dr12 = vec4f_getXXXX(v_c12);
const vec4f v_dg12 = vec4f_getYYYY(v_c12);
const vec4f v_db12 = vec4f_getZZZZ(v_c12);
const vec4f v_da12 = vec4f_getWWWW(v_c12);
// Barycentric coordinate normalization
const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif
// Triangle setup
const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);
// Trivial reject/accept corner offsets relative to block min/max.
const vec4i v_zero = vec4i_zero();
const vec4i v_blockSize = vec4i_fromInt(4);
const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);
const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);
const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);
const vec4i v_trivialRejectOffset = vec4i_add(
vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
);
const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);
// Rasterize
const vec4i v_w_bboxMin = vec4i_fromInt4(
swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned),
swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned),
swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned),
0
);
const vec4i v_w_nextBlock_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize);
const vec4i v_w_nextBlock_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize);
vec4i v_w_blockY = v_w_bboxMin;
for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY; blockMinY += 4) {
vec4i v_w_blockMin = v_w_blockY;
for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX; blockMinX += 4) {
// Evaluate each edge function at its trivial reject corner (the most positive block corner).
// If the trivial rejct corner of any edge is negative (outside the edge) then the triangle
// does not touch the block.
const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset);
if (vec4i_any_neg_SSE2(v_w_trivialReject)) {
v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
continue;
}
// At this point we know that the triangle touches the tile. There are 2 cases:
// - The tile is fully covered by the triangle.
// - The tile is partially covered by the triangle.
//
// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
//
// Evaluate each edge function at its trivial accept corner (the most negative block corner).
// The trivial accept corner is the opposite corner to the trivial reject corner.
// If all trivial accept corners are inside their respective edges then the block is fully
// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
//
// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
// the block's max point.
// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
// (trivial accept) corner:
// trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
// trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
//
const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset);
if (vec4i_any_neg_SSE2(v_w_trivialAccept)) {
// Partial block
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
vec4i v_w0_row0 = vec4i_add(vec4i_getXXXX(v_w_blockMin), v_edge0_dx0123);
vec4i v_w1_row0 = vec4i_add(vec4i_getYYYY(v_w_blockMin), v_edge1_dx0123);
vec4i v_w2_row0 = vec4i_add(vec4i_getZZZZ(v_w_blockMin), v_edge2_dx0123);
vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);
// Calculate the (inverse) pixel mask.
// If any of the barycentric coordinates is negative, the pixel mask will
// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
// to blend between the existing framebuffer values and the new values.
const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);
if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
}
if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
}
if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
}
if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
}
} else {
#if 1
// Full block
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
#if !SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_w0_row0 = vec4i_add(vec4i_getXXXX(v_w_blockMin), v_edge0_dx0123);
const vec4i v_w1_row0 = vec4i_add(vec4i_getYYYY(v_w_blockMin), v_edge1_dx0123);
const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
#if SWR_CONFIG_NO_PIXEL_SHADER
const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
#endif
}
v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
}
v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy);
}
}
// Rasterizes a triangle with per-vertex RGBA8 colors into ctx->m_FrameBuffer.
//
// The clamped bounding box is walked in groups of 16x4 pixels. Each group is treated
// as four side-by-side 4x4 blocks whose edge functions are evaluated in parallel
// (one block per SIMD lane). Every block is classified as:
//  - trivially rejected (skipped),
//  - partially covered (each 4-pixel row is stored through a coverage mask), or
//  - fully covered (each 4-pixel row is stored unconditionally).
//
// ctx              : Rendering context; m_Width, m_Height and m_FrameBuffer are used.
// (x0,y0)..(x2,y2) : Vertex positions in pixels. Any winding is accepted; zero-area
//                    triangles are ignored.
// color0..color2   : Colors of the respective vertices, interpolated across the
//                    triangle. When SWR_CONFIG_NO_PIXEL_SHADER is non-zero the colors
//                    are ignored and covered pixels are written as 0xFFFFFFFF.
//
// NOTE(review): blocks are always read/written as whole 4x4 tiles inside 16x4 groups,
// so the framebuffer dimensions are presumably expected to be multiples of 16 (width)
// and 4 (height) - confirm against the framebuffer allocation.
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box. Both min and max are inclusive pixel coordinates
	// clamped to the framebuffer. Only the min corner is aligned; the block loops below
	// compare against the inclusive max directly.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);

	// Barycentric coordinate normalization
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Trivial reject/accept corner offsets relative to block min/max.
	// Lanes are (edge0, edge1, edge2, unused).
	const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
	const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);
	const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
	const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
	const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);
	const vec4i v_zero = vec4i_zero();
	const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
	const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);
	// Only non-negative steps contribute to the reject offset, i.e. the offset leads to
	// the block corner where the edge function is largest.
	const vec4i v_trivialRejectOffset = vec4i_add(
		vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
		vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
	);
	const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);
	// The accept corner is the corner opposite to the reject corner.
	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
	const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

	// Per-pixel steps inside a block: 4 consecutive pixels of a row, and one row down.
	const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
	const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
	const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
	const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

	// Rasterize
	const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
	const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
	const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
	const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
	const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
	const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);

	// Each lane holds the edge function value at the min corner of one of the four
	// 4x4 blocks of the current 16x4 group (x offsets 0, 4, 8 and 12).
	const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
	vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
	vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
	vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));

	// The bounding box max is inclusive, so every group whose first row/column is
	// <= bboxMax has to be visited. Comparing against an aligned-up max would skip the
	// last row/column whenever bboxMax is itself a multiple of the group size (assuming
	// the align-up helper leaves already aligned values unchanged).
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += 4) {
		uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
		vec4i v_w0_blockMin = v_w0_blockY;
		vec4i v_w1_blockMin = v_w1_blockY;
		vec4i v_w2_blockMin = v_w2_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += 16) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
			const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
			const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
			const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);

			// Bit i is set when block i of the group was NOT trivially rejected.
			uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
			if (trivialRejectBlockMask == 0) {
				// None of the 4 blocks is touched by the triangle; move on to the next group.
				v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
				v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
				v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
				continue;
			}

			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// If the trivial accept corner of all edges is positive (inside the edge) then the triangle
			// fully covers the block.
			const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
			const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
			const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
			const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);

			// Bit i is set when block i is only partially covered (some accept corner is negative).
			uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

			// Spill the per-block edge values so they can be indexed by block below.
			int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
			vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
			vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
			vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

			// Walk the blocks of the group; the loop ends as soon as no candidate block remains.
			for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
				++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1)
			{
				if ((trivialRejectBlockMask & 1) == 0) {
					continue;
				}

				uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];
				vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
				vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);

				if ((trivialAcceptBlockMask & 1) != 0) {
					// Partial block
					vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
					vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
					vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
					vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
					vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

					// Calculate the (inverse) pixel mask.
					// If any of the barycentric coordinates is negative, the pixel mask will
					// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of each row
					// to blend between the existing framebuffer values and the new values.
					const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
					const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
					const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
					const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

					// Row 0 (skipped entirely when no pixel of the row is covered)
					if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
					}

					// Row 1
					if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
					}

					// Row 2
					if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
					}
				} else {
					// Full block: every pixel is covered, so edge 2 is not needed (it is only
					// used for the coverage mask) and all rows are stored unconditionally.
#if !SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

					// Row 0
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
						const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
						const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
						const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
						const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
						vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
					}

					// Row 1
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
						const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
						const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
						const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
						const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
						vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
					}

					// Row 2
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
						const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
						const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
						const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
						const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
						vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
						const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
						const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
						const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
						const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
						vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
					}
				}
			}

			// Step the edge functions to the next 16-pixel wide group of this block row.
			v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
			v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
			v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
		}

		// Step the edge functions 4 rows down to the next block row.
		v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
		v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
		v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
	}
}
// Same as the corresponding SSE2 version except from the usage of pshufb
// (the RGBA8 packing goes through the SSSE3 helper on both the partial and the full block path).
//
// Rasterizes a triangle with per-vertex RGBA8 colors into ctx->m_FrameBuffer.
// The clamped bounding box is walked in kBlockSize x kBlockSize blocks. Each block is
// either trivially rejected, partially covered (masked 4-pixel stores, with a per-row
// early out) or fully covered (unconditional 4-pixel stores).
//
// ctx              : Rendering context; m_Width, m_Height and m_FrameBuffer are used.
// (x0,y0)..(x2,y2) : Vertex positions in pixels. Any winding is accepted; zero-area
//                    triangles are ignored.
// color0..color2   : Colors of the respective vertices, interpolated across the
//                    triangle. When SWR_CONFIG_NO_PIXEL_SHADER is non-zero the colors
//                    are ignored and covered pixels are written as 0xFFFFFFFF.
//
// NOTE(review): pixels are processed 4 at a time over whole blocks, so kBlockSize is
// presumably a multiple of 4 and the framebuffer dimensions multiples of kBlockSize -
// confirm against the kBlockSize definition and the framebuffer allocation.
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box. Both min and max are inclusive pixel coordinates
	// clamped to the framebuffer. Only the min corner is aligned to the block grid.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	// Vector lanes are (edge0, edge1, edge2, unused).
	const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
	const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
	const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
	const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
	const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
	const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);
	const vec4i v_trivialRejectOffset = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0) + (edge0.m_dy >= 0 ? w0_blockMax_dy : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0) + (edge1.m_dy >= 0 ? w1_blockMax_dy : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0) + (edge2.m_dy >= 0 ? w2_blockMax_dy : 0),
		0
	);
	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_fromInt4(
		(w0_blockMax_dx + w0_blockMax_dy),
		(w1_blockMax_dx + w1_blockMax_dy),
		(w2_blockMax_dx + w2_blockMax_dy),
		0), v_trivialRejectOffset
	);
	// Horizontal part of the reject offset; used to reject single rows of a partial block.
	const vec4i v_trivialRejectOffset_dx = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0),
		0
	);

	// Per-pixel steps: 4 consecutive pixels of a row, the next 4 pixels, and one row down.
	const vec4i v_edge0_dx0123 = vec4i_fromInt4(0, edge0.m_dx, edge0.m_dx * 2, edge0.m_dx * 3);
	const vec4i v_edge1_dx0123 = vec4i_fromInt4(0, edge1.m_dx, edge1.m_dx * 2, edge1.m_dx * 3);
	const vec4i v_edge2_dx0123 = vec4i_fromInt4(0, edge2.m_dx, edge2.m_dx * 2, edge2.m_dx * 3);
	const vec4i v_edge0_dx4 = vec4i_fromInt(edge0.m_dx * 4);
	const vec4i v_edge1_dx4 = vec4i_fromInt(edge1.m_dx * 4);
	const vec4i v_edge2_dx4 = vec4i_fromInt(edge2.m_dx * 4);
	const vec4i v_edge012__dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	// Rasterize
	const vec4i v_w_bboxMin = vec4i_fromInt4(
		swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned),
		0
	);
	const vec4i v_w_nextBlock_dx = vec4i_fromInt4(edge0.m_dx * kBlockSize, edge1.m_dx * kBlockSize, edge2.m_dx * kBlockSize, 0);
	const vec4i v_w_nextBlock_dy = vec4i_fromInt4(edge0.m_dy * kBlockSize, edge1.m_dy * kBlockSize, edge2.m_dy * kBlockSize, 0);

	// The bounding box max is inclusive, so every block whose first row/column is
	// <= bboxMax has to be visited. A strict '<' would skip the last row/column whenever
	// bboxMax is itself a multiple of kBlockSize (e.g. a vertex sitting exactly on a
	// block boundary).
	vec4i v_w_blockY = v_w_bboxMin;
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
		vec4i v_w_blockMin = v_w_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialReject)) {
				v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//   trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//   trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialAccept)) {
				// Partial block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Skip rows of the block that lie completely outside of at least one edge.
					const vec4i v_w_rowTrivialReject = vec4i_add(v_w_row, v_trivialRejectOffset_dx);
					if (!vec4i_any_neg_SSE2(v_w_rowTrivialReject)) {
						vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
						vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
						vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);

						for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
							// Calculate the (inverse) pixel mask.
							// If any of the barycentric coordinates is negative, the pixel mask will
							// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
							// to blend between the existing framebuffer values and the new values.
							const vec4i v_izero = vec4i_zero();
							const vec4i v_w0_lt = vec4i_cmplt(v_w0, v_izero);
							const vec4i v_w1_lt = vec4i_cmplt(v_w1, v_izero);
							const vec4i v_w2_lt = vec4i_cmplt(v_w2, v_izero);
							const vec4i v_notPixelMask = vec4i_or(v_w0_lt, vec4i_or(v_w1_lt, v_w2_lt));

							{
#if SWR_CONFIG_NO_PIXEL_SHADER
								const vec4i v_oldFB = vec4i_fromInt4va(&fb_row[px]);
								const vec4i v_newFB = vec4i_or(vec4i_and(v_notPixelMask, v_oldFB), vec4i_andnot(v_notPixelMask, vec4i_fromInt(-1)));
								vec4i_toInt4va(v_newFB, &fb_row[px]);
#else
								const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
								const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);

								// l2 = 1.0f - (l0 + l1)
								//
								// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
								// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
								// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
								// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
								//
								// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
								const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
								const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
								const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
								const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

								// Pack independent R32/G32/B32/A32 values of the 4 pixels into RGBA8.
								const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);

								// Store result using the pixel mask
								const vec4i v_newFB = vec4i_or(
									vec4i_and(v_notPixelMask, vec4i_fromInt4va(&fb_row[px])),
									vec4i_andnot(v_notPixelMask, v_rgba8)
								);
								vec4i_toInt4va(v_newFB, &fb_row[px]);
#endif
							}

							v_w0 = vec4i_add(v_w0, v_edge0_dx4);
							v_w1 = vec4i_add(v_w1, v_edge1_dx4);
							v_w2 = vec4i_add(v_w2, v_edge2_dx4);
						}
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			} else {
				// Full block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Calculate barycentric coords at pxmin
					vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
					vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
					vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);

					for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
#if SWR_CONFIG_NO_PIXEL_SHADER
							vec4i_toInt4va(vec4i_fromInt(-1), &fb_row[px]);
#else
							const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
							const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);

							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
							const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
							const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
							const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

							// Use the same SSSE3 packing helper as the partial block path so that
							// both paths of this variant go through the same pack routine.
							const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
							vec4i_toInt4va(v_rgba8, &fb_row[px]);
#endif
						}

						v_w0 = vec4i_add(v_w0, v_edge0_dx4);
						v_w1 = vec4i_add(v_w1, v_edge1_dx4);
						v_w2 = vec4i_add(v_w2, v_edge2_dx4);
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			}

			// Step the edge functions to the next block of this block row.
			v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
		}

		// Step the edge functions down to the next block row.
		v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy);
	}
}
// Rasterizes one triangle with per-vertex colors into ctx->m_FrameBuffer.
//
// The clamped bounding box is walked in tiles of 16x4 pixels. Every tile is handled as four
// horizontally adjacent 4x4 blocks, one block per SIMD lane: the three edge functions are
// evaluated at each block's trivial reject and trivial accept corners, so that
//   - blocks the triangle cannot touch are skipped,
//   - blocks that are fully covered are shaded and stored without a coverage mask,
//   - all other (partial) blocks are shaded one 4-pixel row at a time and stored through a
//     per-pixel coverage mask (vec4i_toInt4va_maskedInv_SSE2).
// Colors are interpolated from the barycentric weights and packed with
// vec4i_packR32G32B32A32_to_RGBA8_SSSE3. When SWR_CONFIG_NO_PIXEL_SHADER is non-zero the
// interpolation is compiled out and every covered pixel is written as vec4i_fromInt(-1).
//
// ctx            - render context; m_Width, m_Height and m_FrameBuffer are read, the
//                  framebuffer contents are written.
// (x0,y0)..(x2,y2) - integer vertex positions. Either winding is accepted; zero-area
//                  triangles are ignored.
// color0..color2 - per-vertex colors, decoded with vec4f_fromRGBA8 (presumably packed
//                  8-bit-per-channel RGBA - confirm the channel order in swr_math.h).
//
// NOTE(review): blocks are loaded/stored as whole 4-pixel rows and nothing here clips against
// the framebuffer edge, so this appears to rely on m_Width being a multiple of 16 and m_Height
// being a multiple of 4 - confirm against swrCreateContext.
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2). The colors travel with their vertices.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute the triangle bounding box, clamped to the framebuffer, then snap it to the
    // 16x4 tile grid used by the loops below.
    // NOTE(review): if swr_alignUp returns an already aligned value unchanged, the column/row
    // at bboxMaxX/bboxMaxY is not visited when that coordinate is itself a multiple of the
    // tile size - confirm this matches the reference rasterizer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
    const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
    const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16);
    const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);

    // Prepare interpolated attributes.
    // With l2 = 1 - (l0 + l1) every attribute reduces to
    //   attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2
    // so only the two deltas and the vertex 2 value are kept, one channel per vector.
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const vec4f v_c0 = vec4f_fromRGBA8(color0);
    const vec4f v_c1 = vec4f_fromRGBA8(color1);
    const vec4f v_c2 = vec4f_fromRGBA8(color2);
    const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
    const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
    const vec4f v_r2 = vec4f_getXXXX(v_c2);
    const vec4f v_g2 = vec4f_getYYYY(v_c2);
    const vec4f v_b2 = vec4f_getZZZZ(v_c2);
    const vec4f v_a2 = vec4f_getWWWW(v_c2);
    const vec4f v_dr02 = vec4f_getXXXX(v_c02);
    const vec4f v_dg02 = vec4f_getYYYY(v_c02);
    const vec4f v_db02 = vec4f_getZZZZ(v_c02);
    const vec4f v_da02 = vec4f_getWWWW(v_c02);
    const vec4f v_dr12 = vec4f_getXXXX(v_c12);
    const vec4f v_dg12 = vec4f_getYYYY(v_c12);
    const vec4f v_db12 = vec4f_getZZZZ(v_c12);
    const vec4f v_da12 = vec4f_getWWWW(v_c12);
    // Barycentric coordinate normalization
    const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

    // Triangle setup: edge N is the edge opposite to vertex N.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Trivial reject/accept corner offsets relative to block min/max.
    // Lanes X/Y/Z hold the values for edge 0/1/2; lane W is unused padding.
    const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
    const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);
    const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
    const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
    const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);
    const vec4i v_zero = vec4i_zero();
    const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
    const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);
    // Reject corner: step to the far side of the block only along axes where the edge
    // function increases (non-negative step), i.e. the corner with the largest edge value.
    const vec4i v_trivialRejectOffset = vec4i_add(
        vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
        vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
    );
    const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
    const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
    const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);
    // Accept corner: the diagonally opposite corner (the remaining part of the full step).
    const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
    const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
    const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
    const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

    // Per-pixel x steps inside one 4-pixel row and the per-row y step, for each edge.
    const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
    const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
    const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
    const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

    // Rasterize
    const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
    const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
    const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));
    // Edge function increments for moving one tile (16 pixels) right / one tile (4 rows) down.
    const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
    const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
    const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
    const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
    const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
    const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);
    // Lane i holds the edge value at the min corner of the i-th 4x4 block of the tile.
    const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
    vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
    vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
    vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));

    for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) {
        uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
        vec4i v_w0_blockMin = v_w0_blockY;
        vec4i v_w1_blockMin = v_w1_blockY;
        vec4i v_w2_blockMin = v_w2_blockY;

        for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) {
            // Evaluate each edge function at its trivial reject corner (the most positive block corner).
            // If the trivial reject corner of any edge is negative (outside the edge) then the triangle
            // does not touch the block.
            const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
            const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
            const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
            const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);
            // Bit i is set when block i survives the reject test (no sign bit in the OR of the edges).
            uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
            if (trivialRejectBlockMask == 0) {
                // All four blocks rejected. 'continue' skips the advance at the bottom of the
                // loop body, so step to the next tile here.
                v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
                v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
                v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
                continue;
            }

            // Evaluate each edge function at its trivial accept corner (the most negative block corner).
            // If the trivial accept corner of all edges is positive (inside the edge) then the triangle
            // fully covers the block.
            const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
            const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
            const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
            const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);
            // Bit i is set when block i FAILS the accept test, i.e. it is only partially covered.
            uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

            // Spill the per-block edge values so they can be indexed by block number below.
            int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
            vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
            vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
            vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

            // Walk the four blocks of the tile; both masks are shifted in lock-step so bit 0
            // always refers to the current block. The loop ends as soon as no candidate is left.
            for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
                ++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1) {
                if ((trivialRejectBlockMask & 1) == 0) {
                    continue;
                }

                uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];
                // Edge values for the 4 pixels of the block's first row. Edge 2 is only needed
                // for the coverage test, so it is computed in the partial branch only.
                vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
                vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);
                if ((trivialAcceptBlockMask & 1) != 0) {
                    // Partial block
                    vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
                    vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
                    vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
                    vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
                    vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
                    vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
                    vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
                    vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
                    vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
                    vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

                    // Calculate the (inverse) pixel mask.
                    // If any of the barycentric coordinates is negative, the pixel mask will
                    // be equal to 0xFFFFFFFF for that pixel. This mask is used by the masked
                    // store of each row to keep the existing framebuffer value for that pixel.
                    const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
                    const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
                    const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
                    const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

                    // Row 0 (skipped entirely when none of its 4 pixels is covered)
                    if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
                    }

                    // Row 1
                    if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
                    }

                    // Row 2
                    if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
                    }

                    // Row 3
                    if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
                    }
                } else {
                    // Full block: every pixel is covered, so no coverage mask (and no edge 2
                    // value) is required; rows are stored unconditionally.
#if !SWR_CONFIG_NO_PIXEL_SHADER
                    const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
                    const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
                    const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
                    const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
                    const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
                    const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

                    // Row 0
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
                        const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
                        const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
                        const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
                        const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
                        const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
                        const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
                        vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
                    }

                    // Row 1
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
                        const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
                        const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
                        const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
                        const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
                        const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
                        const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
                        vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
                    }

                    // Row 2
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
                        const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
                        const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
                        const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
                        const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
                        const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
                        const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
                        vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
                    }

                    // Row 3
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
                        const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
                        const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
                        const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
                        const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
                        const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
                        const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
                        vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
                    }
                }
            }

            // Step the four block-corner values to the next 16-pixel tile.
            v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
            v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
            v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
        }

        // Step the row-start values down to the next row of tiles (4 pixel rows).
        v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
        v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
        v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
    }
}
// SSE4.1 flavour of the 16x4-tile hierarchical triangle rasterizer.
//
// The clamped bounding box is walked in tiles of 16x4 pixels, each treated as four
// horizontally adjacent 4x4 blocks (one block per SIMD lane). The three edge functions are
// evaluated at every block's trivial reject and trivial accept corners:
//   - blocks the triangle cannot touch are skipped,
//   - fully covered blocks are shaded and stored without a coverage mask,
//   - partial blocks are shaded one 4-pixel row at a time and stored through a per-pixel
//     coverage mask using vec4i_toInt4va_maskedInv_SSE41.
// Color packing still goes through vec4i_packR32G32B32A32_to_RGBA8_SSSE3. When
// SWR_CONFIG_NO_PIXEL_SHADER is non-zero the interpolation is compiled out and every covered
// pixel is written as vec4i_fromInt(-1).
//
// ctx            - render context; m_Width, m_Height and m_FrameBuffer are read, the
//                  framebuffer contents are written.
// (x0,y0)..(x2,y2) - integer vertex positions. Either winding is accepted; zero-area
//                  triangles are ignored.
// color0..color2 - per-vertex colors, decoded with vec4f_fromRGBA8 (presumably packed
//                  8-bit-per-channel RGBA - confirm the channel order in swr_math.h).
//
// NOTE(review): blocks are loaded/stored as whole 4-pixel rows and nothing here clips against
// the framebuffer edge, so this appears to rely on m_Width being a multiple of 16 and m_Height
// being a multiple of 4 - confirm against swrCreateContext.
static void swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2). The colors travel with their vertices.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute the triangle bounding box, clamped to the framebuffer, then snap it to the
    // 16x4 tile grid used by the loops below.
    // NOTE(review): if swr_alignUp returns an already aligned value unchanged, the column/row
    // at bboxMaxX/bboxMaxY is not visited when that coordinate is itself a multiple of the
    // tile size - confirm this matches the reference rasterizer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
    const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
    const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16);
    const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);

    // Prepare interpolated attributes.
    // With l2 = 1 - (l0 + l1) every attribute reduces to
    //   attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2
    // so only the two deltas and the vertex 2 value are kept, one channel per vector.
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const vec4f v_c0 = vec4f_fromRGBA8(color0);
    const vec4f v_c1 = vec4f_fromRGBA8(color1);
    const vec4f v_c2 = vec4f_fromRGBA8(color2);
    const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
    const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
    const vec4f v_r2 = vec4f_getXXXX(v_c2);
    const vec4f v_g2 = vec4f_getYYYY(v_c2);
    const vec4f v_b2 = vec4f_getZZZZ(v_c2);
    const vec4f v_a2 = vec4f_getWWWW(v_c2);
    const vec4f v_dr02 = vec4f_getXXXX(v_c02);
    const vec4f v_dg02 = vec4f_getYYYY(v_c02);
    const vec4f v_db02 = vec4f_getZZZZ(v_c02);
    const vec4f v_da02 = vec4f_getWWWW(v_c02);
    const vec4f v_dr12 = vec4f_getXXXX(v_c12);
    const vec4f v_dg12 = vec4f_getYYYY(v_c12);
    const vec4f v_db12 = vec4f_getZZZZ(v_c12);
    const vec4f v_da12 = vec4f_getWWWW(v_c12);
    // Barycentric coordinate normalization
    const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

    // Triangle setup: edge N is the edge opposite to vertex N.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Trivial reject/accept corner offsets relative to block min/max.
    // Lanes X/Y/Z hold the values for edge 0/1/2; lane W is unused padding.
    const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
    const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);
    const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
    const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
    const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);
    const vec4i v_zero = vec4i_zero();
    const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
    const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);
    // Reject corner: step to the far side of the block only along axes where the edge
    // function increases (non-negative step), i.e. the corner with the largest edge value.
    const vec4i v_trivialRejectOffset = vec4i_add(
        vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
        vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
    );
    const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
    const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
    const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);
    // Accept corner: the diagonally opposite corner (the remaining part of the full step).
    const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
    const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
    const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
    const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

    // Per-pixel x steps inside one 4-pixel row and the per-row y step, for each edge.
    const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
    const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
    const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
    const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

    // Rasterize
    const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
    const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
    const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));
    // Edge function increments for moving one tile (16 pixels) right / one tile (4 rows) down.
    const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
    const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
    const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
    const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
    const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
    const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);
    // Lane i holds the edge value at the min corner of the i-th 4x4 block of the tile.
    const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
    vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
    vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
    vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));

    for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) {
        uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
        vec4i v_w0_blockMin = v_w0_blockY;
        vec4i v_w1_blockMin = v_w1_blockY;
        vec4i v_w2_blockMin = v_w2_blockY;

        for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) {
            // Evaluate each edge function at its trivial reject corner (the most positive block corner).
            // If the trivial reject corner of any edge is negative (outside the edge) then the triangle
            // does not touch the block.
            const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
            const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
            const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
            const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);
            // Bit i is set when block i survives the reject test (no sign bit in the OR of the edges).
            uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
            if (trivialRejectBlockMask == 0) {
                // All four blocks rejected. 'continue' skips the advance at the bottom of the
                // loop body, so step to the next tile here.
                v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
                v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
                v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
                continue;
            }

            // Evaluate each edge function at its trivial accept corner (the most negative block corner).
            // If the trivial accept corner of all edges is positive (inside the edge) then the triangle
            // fully covers the block.
            const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
            const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
            const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
            const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);
            // Bit i is set when block i FAILS the accept test, i.e. it is only partially covered.
            uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

            // Spill the per-block edge values so they can be indexed by block number below.
            int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
            vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
            vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
            vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

            // Walk the four blocks of the tile; both masks are shifted in lock-step so bit 0
            // always refers to the current block. The loop ends as soon as no candidate is left.
            for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
                ++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1) {
                if ((trivialRejectBlockMask & 1) == 0) {
                    continue;
                }

                uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];
                // Edge values for the 4 pixels of the block's first row. Edge 2 is only needed
                // for the coverage test, so it is computed in the partial branch only.
                vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
                vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);
                if ((trivialAcceptBlockMask & 1) != 0) {
                    // Partial block
                    vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
                    vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
                    vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
                    vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
                    vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
                    vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
                    vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
                    vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
                    vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
                    vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

                    // Calculate the (inverse) pixel mask.
                    // If any of the barycentric coordinates is negative, the pixel mask will
                    // be equal to 0xFFFFFFFF for that pixel. This mask is used by the masked
                    // store of each row to keep the existing framebuffer value for that pixel.
                    const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
                    const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
                    const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
                    const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

                    // Row 0 (skipped entirely when none of its 4 pixels is covered)
                    if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[0]);
                    }

                    // Row 1
                    if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
                    }

                    // Row 2
                    if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
                    }

                    // Row 3
                    if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
                    }
                } else {
                    // Full block: every pixel is covered, so no coverage mask (and no edge 2
                    // value) is required; rows are stored unconditionally.
#if !SWR_CONFIG_NO_PIXEL_SHADER
                    const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
                    const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
                    const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
                    const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
                    const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
                    const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

                    // Row 0
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
                        const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
                        const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
                        const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
                        const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
                        const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
                        const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
                        vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
                    }

                    // Row 1
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
                        const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
                        const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
                        const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
                        const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
                        const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
                        const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
                        vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
                    }

                    // Row 2
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
                        const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
                        const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
                        const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
                        const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
                        const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
                        const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
                        vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
                    }

                    // Row 3
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
                        const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
                        const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
                        const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
                        const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
                        const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
                        const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
                        vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
                    }
                }
            }

            // Step the four block-corner values to the next 16-pixel tile.
            v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
            v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
            v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
        }

        // Step the row-start values down to the next row of tiles (4 pixel rows).
        v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
        v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
        v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
    }
}
// 2-level hierarchical rasterization using trivial reject/accept corners.
//
// Similar to swrDrawTriangleRef_HierarchicalLRB_Cond() but for each partially covered block
// the range of valid rows is calculated and iterated and for each touched row the range
// of valid pixels/cols is calculated and iterated. This way there is no need for conditionals
// inside the inner-most loop and the 3 barycentric coordinates are always greater than or equal
// to 0.
//
// Hard to vectorize because the valid row/col calculations are scalar.
//
// Even though there are multiple difficult to predict branches inside the inner loops this
// seems to be marginally faster than the reference _NoCond() function.
#if 1
// Rasterizes one triangle with per-vertex colors into ctx->m_FrameBuffer (row pitch ctx->m_Width).
//
// ctx            : render target; pixels outside [0, m_Width) x [0, m_Height) are never addressed
//                  by the bounding box, which is clamped to the frame buffer.
// (x0,y0)..(x2,y2): integer pixel coordinates of the 3 vertices (any winding; CW input is swapped to CCW).
// color0..color2 : packed vertex colors (decoded with SWR_COLOR_*_Msk / SWR_COLOR_*_Pos), interpolated
//                  with barycentric weights unless SWR_CONFIG_NO_PIXEL_SHADER is set.
//
// The clamped bounding box is walked in kBlockSize x kBlockSize blocks. Each block is either trivially
// rejected, filled completely (full block) or, for partially covered blocks, the valid row range and
// per-row valid column range are solved so that the inner-most loops contain no coverage tests.
//
// NOTE(review): blocks are always kBlockSize wide/tall, so this presumably relies on the frame buffer
// dimensions being multiples of kBlockSize - confirm against swrCreateContext().
static void swrDrawTriangleRef_HierarchicalLRB_NoCond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box (inclusive, clamped to the frame buffer) and align its
	// min corner down to the block grid.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	// Attribute deltas relative to vertex 2 (see the interpolation formula in the pixel loops).
	const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
	const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
	const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
	const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
	const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
	const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
	const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
	const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

	// Triangle setup: edge i is the edge opposite to vertex i.
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const float inv_area = 1.0f / (float)iarea;
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	const int32_t trivialRejectOffset0 = 0
		+ (edge0.m_dx >= 0 ? edge0.m_dx * (kBlockSize - 1) : 0)
		+ (edge0.m_dy >= 0 ? edge0.m_dy * (kBlockSize - 1) : 0)
		;
	const int32_t trivialRejectOffset1 = 0
		+ (edge1.m_dx >= 0 ? edge1.m_dx * (kBlockSize - 1) : 0)
		+ (edge1.m_dy >= 0 ? edge1.m_dy * (kBlockSize - 1) : 0)
		;
	const int32_t trivialRejectOffset2 = 0
		+ (edge2.m_dx >= 0 ? edge2.m_dx * (kBlockSize - 1) : 0)
		+ (edge2.m_dy >= 0 ? edge2.m_dy * (kBlockSize - 1) : 0)
		;
	const int32_t trivialAcceptOffset0 = (edge0.m_dx + edge0.m_dy) * (kBlockSize - 1) - trivialRejectOffset0;
	const int32_t trivialAcceptOffset1 = (edge1.m_dx + edge1.m_dy) * (kBlockSize - 1) - trivialRejectOffset1;
	const int32_t trivialAcceptOffset2 = (edge2.m_dx + edge2.m_dy) * (kBlockSize - 1) - trivialRejectOffset2;

	// Rasterize
	const int32_t w0_bboxMin = swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w1_bboxMin = swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w2_bboxMin = swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned);
	// Edge function steps when moving one whole block right / down.
	const int32_t w0_block_dx = edge0.m_dx * kBlockSize;
	const int32_t w0_block_dy = edge0.m_dy * kBlockSize;
	const int32_t w1_block_dx = edge1.m_dx * kBlockSize;
	const int32_t w1_block_dy = edge1.m_dy * kBlockSize;
	const int32_t w2_block_dx = edge2.m_dx * kBlockSize;
	const int32_t w2_block_dy = edge2.m_dy * kBlockSize;

	int32_t w0_blockY = w0_bboxMin;
	int32_t w1_blockY = w1_bboxMin;
	int32_t w2_blockY = w2_bboxMin;
	// bboxMaxX/bboxMaxY are inclusive pixel coordinates, so the block that starts exactly at
	// the bbox max must be visited too ('<=' instead of '<'). Otherwise pixels on the last
	// column/row are dropped whenever that column/row is block aligned.
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
		int32_t w0_blockMin = w0_blockY;
		int32_t w1_blockMin = w1_blockY;
		int32_t w2_blockMin = w2_blockY;
		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const int32_t w0_trivialReject = w0_blockMin + trivialRejectOffset0;
			const int32_t w1_trivialReject = w1_blockMin + trivialRejectOffset1;
			const int32_t w2_trivialReject = w2_blockMin + trivialRejectOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialReject, w1_trivialReject, w2_trivialReject)) {
				w0_blockMin += w0_block_dx;
				w1_blockMin += w1_block_dx;
				w2_blockMin += w2_block_dx;
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			// trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			// trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const int32_t w0_trivialAccept = w0_blockMin + trivialAcceptOffset0;
			const int32_t w1_trivialAccept = w1_blockMin + trivialAcceptOffset1;
			const int32_t w2_trivialAccept = w2_blockMin + trivialAcceptOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialAccept, w1_trivialAccept, w2_trivialAccept)) {
				// Partial block: narrow the inclusive row range [pymin, pymax] using every edge
				// that does not trivially accept the block.
				int32_t pymin = 0;
				int32_t pymax = kBlockSize - 1;
				if (w0_trivialAccept < 0) {
					// Evaluate 1st edge function at the 4 block corners. If all of the signed
					// distances are negative (all sign bits are 1) then the block will be empty.
					const int32_t w0_A = w0_blockMin;
					const int32_t w0_B = w0_blockMin + w0_block_dx;
					const int32_t w0_C = w0_blockMin + w0_block_dx + w0_block_dy;
					const int32_t w0_D = w0_blockMin + w0_block_dy;
					const uint32_t w0_blockMsk = SWR_BLOCK_MASK(w0_A, w0_B, w0_C, w0_D);
					assert(SWR_BLOCK_MASK_IS_VALID(w0_blockMsk));
					assert(w0_blockMsk != SWR_BLOCK_MASK_EMPTY);
					if (SWR_BLOCK_MASK_Y_MAX(w0_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w0_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}
				}
				if (w1_trivialAccept < 0) {
					// Evaluate 2nd edge function at the 4 block corners. If all of the signed
					// distances are negative (all sign bits are 1) then the block will be empty.
					const int32_t w1_A = w1_blockMin;
					const int32_t w1_B = w1_blockMin + w1_block_dx;
					const int32_t w1_C = w1_blockMin + w1_block_dx + w1_block_dy;
					const int32_t w1_D = w1_blockMin + w1_block_dy;
					const uint32_t w1_blockMsk = SWR_BLOCK_MASK(w1_A, w1_B, w1_C, w1_D);
					assert(SWR_BLOCK_MASK_IS_VALID(w1_blockMsk));
					assert(w1_blockMsk != SWR_BLOCK_MASK_EMPTY);
					if (SWR_BLOCK_MASK_Y_MAX(w1_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w1_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}
				}
				if (w2_trivialAccept < 0) {
					// Evaluate 3rd edge function at the 4 block corners. If all of the signed
					// distances are negative (all sign bits are 1) then the block will be empty.
					const int32_t w2_A = w2_blockMin;
					const int32_t w2_B = w2_blockMin + w2_block_dx;
					const int32_t w2_C = w2_blockMin + w2_block_dx + w2_block_dy;
					const int32_t w2_D = w2_blockMin + w2_block_dy;
					const uint32_t w2_blockMsk = SWR_BLOCK_MASK(w2_A, w2_B, w2_C, w2_D);
					assert(SWR_BLOCK_MASK_IS_VALID(w2_blockMsk));
					assert(w2_blockMsk != SWR_BLOCK_MASK_EMPTY);
					if (SWR_BLOCK_MASK_Y_MAX(w2_blockMsk)) {
						const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
						pymax = swr_mini(pymax, w_pymax);
					} else if (SWR_BLOCK_MASK_Y_MIN(w2_blockMsk)) {
						const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
						pymin = swr_maxi(pymin, w_pymin);
					}
				}

				// Evaluate edge functions at the first row.
				int32_t w0_blockMinX_py = w0_blockMin + edge0.m_dy * pymin;
				int32_t w1_blockMinX_py = w1_blockMin + edge1.m_dy * pymin;
				int32_t w2_blockMinX_py = w2_blockMin + edge2.m_dy * pymin;
				for (int32_t py = pymin; py <= pymax; ++py) {
					const int32_t w0_rowMsk = SWR_ROW_MASK(w0_blockMinX_py, w0_blockMinX_py + w0_block_dx);
					const int32_t w1_rowMsk = SWR_ROW_MASK(w1_blockMinX_py, w1_blockMinX_py + w1_block_dx);
					const int32_t w2_rowMsk = SWR_ROW_MASK(w2_blockMinX_py, w2_blockMinX_py + w2_block_dx);
					assert(w0_rowMsk != SWR_ROW_MASK_EMPTY);
					assert(w1_rowMsk != SWR_ROW_MASK_EMPTY);
					assert(w2_rowMsk != SWR_ROW_MASK_EMPTY);

					uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + (blockMinY + py) * ctx->m_Width];

					// Narrow the inclusive column range [pxmin, pxmax] of this row.
					int32_t pxmin = 0;
					int32_t pxmax = (int32_t)kBlockSize - 1;
					if (!SWR_ROW_MASK_ALL_FULL(w0_rowMsk, w1_rowMsk, w2_rowMsk)) {
						if (SWR_ROW_MASK_X_MAX(w0_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w0_blockMinX_py, -edge0.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w0_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w0_blockMinX_py, edge0.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}
						if (SWR_ROW_MASK_X_MAX(w1_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w1_blockMinX_py, -edge1.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w1_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w1_blockMinX_py, edge1.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}
						if (SWR_ROW_MASK_X_MAX(w2_rowMsk)) {
							const int32_t w_pxmax = swr_idiv_floor(w2_blockMinX_py, -edge2.m_dx);
							pxmax = swr_mini(pxmax, w_pxmax);
						} else if (SWR_ROW_MASK_X_MIN(w2_rowMsk)) {
							const int32_t w_pxmin = swr_idiv_ceil(-w2_blockMinX_py, edge2.m_dx);
							pxmin = swr_maxi(pxmin, w_pxmin);
						}
					}

					// Calculate barycentric coords at pxmin
					int32_t w0 = w0_blockMinX_py + pxmin * edge0.m_dx;
					int32_t w1 = w1_blockMinX_py + pxmin * edge1.m_dx;
					int32_t w2 = w2_blockMinX_py + pxmin * edge2.m_dx;
					for (int32_t px = pxmin; px <= pxmax; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;
							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
							fb_row[px] = rgba;
						}
						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}
					w0_blockMinX_py += edge0.m_dy;
					w1_blockMinX_py += edge1.m_dy;
					w2_blockMinX_py += edge2.m_dy;
				}
			} else {
				// Full block: every pixel of the block is inside the triangle.
				int32_t w0_row = w0_blockMin;
				int32_t w1_row = w1_blockMin;
				int32_t w2_row = w2_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Calculate barycentric coords at the first pixel of the row
					int32_t w0 = w0_row;
					int32_t w1 = w1_row;
					int32_t w2 = w2_row;
					for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;
							// l2 = 1.0f - (l0 + l1)
							//
							// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
							// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
							// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
							// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
							//
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
							fb_row[px] = rgba;
						}
						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}
					w0_row += edge0.m_dy;
					w1_row += edge1.m_dy;
					w2_row += edge2.m_dy;
					fb_row += ctx->m_Width;
				}
			}
			w0_blockMin += w0_block_dx;
			w1_blockMin += w1_block_dx;
			w2_blockMin += w2_block_dx;
		}
		w0_blockY += w0_block_dy;
		w1_blockY += w1_block_dy;
		w2_blockY += w2_block_dy;
	}
}
#endif
// Initial SSE2 implementation based on swrDrawTriangleRef()
#if 1
// Narrows the inclusive pixel range [*pxmin, *pxmax] of one row to the indices i for which the
// linear edge function w(i) = w_start + i * w_step is greater than or equal to 0.
//
//   1. w_start >= 0 && w_step >= 0 : every i >= 0 qualifies, range is left untouched.
//   2. w_start >= 0 && w_step <  0 : i <= floor(w_start / -w_step)
//   3. w_start <  0 && w_step >  0 : i >= ceil(-w_start / w_step)
//   4. w_start <  0 && w_step <= 0 : no i qualifies, the range is made empty (*pxmax = -1).
//
// *pxmin is expected to be >= 0 on entry so that *pxmax = -1 always denotes an empty range.
static void swrSSE2Ref_clipRowRange(int32_t w_start, int32_t w_step, int32_t* pxmin, int32_t* pxmax)
{
	if (w_start >= 0) {
		if (w_step < 0) {
			// Operands have opposite signs, so C's truncating division followed by the
			// negation yields floor(w_start / |w_step|).
			*pxmax = swr_mini(*pxmax, -(w_start / w_step));
		}
	} else if (w_step > 0) {
		// ceil(|w_start| / w_step): truncated quotient plus 1 only if there is a remainder.
		// An exact division means w(i) == 0 at that pixel, i.e. the pixel lies on the edge
		// and must be kept (the unconditional '+ 1' used previously skipped it).
		const int32_t imin = -(w_start / w_step) + ((w_start % w_step) != 0 ? 1 : 0);
		*pxmin = swr_maxi(*pxmin, imin);
	} else {
		// The edge function is negative for the whole row. This happens for rows of a
		// triangle whose covered span lies entirely outside the clamped bounding box.
		*pxmax = -1;
	}
}

// Rasterizes one triangle with per-vertex colors into ctx->m_FrameBuffer (row pitch ctx->m_Width)
// by scanning the rows of its screen-clamped bounding box.
//
// ctx            : render target.
// (x0,y0)..(x2,y2): integer pixel coordinates of the 3 vertices (any winding; CW input is swapped to CCW).
// color0..color2 : packed RGBA8 vertex colors, interpolated with barycentric weights unless
//                  SWR_CONFIG_NO_PIXEL_SHADER is set.
//
// The 3 edge functions are kept in the x/y/z lanes of one vec4i (w lane unused). For each row the
// inclusive column range in which all 3 edge functions are >= 0 is solved up front, so the pixel
// loop needs no coverage test.
static void swrDrawTriangleSSE2_Ref(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute triangle bounding box (inclusive, clamped to the frame buffer)
	const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxWidth = maxX - minX;
	const int32_t bboxHeight = maxY - minY;
	if (bboxWidth < 0 || bboxHeight < 0) {
		// The triangle is completely off-screen; nothing to draw. Bail out before forming a
		// frame buffer pointer from an out-of-range (minX, minY).
		return;
	}

	// Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
#endif

	// Triangle setup: w_i(x, y) = w_px[i] * x + w_py[i] * y + w_c[i]
	const vec4i v_w_px = vec4i_fromInt4(y1 - y2, y2 - y0, y0 - y1, 0);
	const vec4i v_w_py = vec4i_fromInt4(x2 - x1, x0 - x2, x1 - x0, 0);
	const vec4i v_w_c = vec4i_fromInt4(x1 * y2 - y1 * x2, x2 * y0 - y2 * x0, x0 * y1 - y0 * x1, 0);
	const vec4i v_minX = vec4i_fromInt(minX);
	const vec4i v_minY = vec4i_fromInt(minY);
	const vec4i v_w_pmin = vec4i_add(v_w_c, vec4i_add(vec4i_mullo_SSE2(v_w_px, v_minX), vec4i_mullo_SSE2(v_w_py, v_minY)));

	// The per-pixel steps never change, extract them once for the scalar range calculations.
	int32_t w_px[4];
	vec4i_toInt4vu(v_w_px, &w_px[0]);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Rasterize
	vec4i v_w_row = v_w_pmin;
	uint32_t* fb_row = &ctx->m_FrameBuffer[minX + minY * ctx->m_Width];
	for (int32_t py = 0; py <= bboxHeight; ++py) {
		int32_t pxmin = 0;
		int32_t pxmax = bboxWidth;

		// Calculate the range of x values for which all 3 barycentric coordinates
		// are greater than or equal to 0. If any edge rejects the whole row the range
		// ends up empty (pxmax < pxmin) and the pixel loop below does not execute.
		{
			int32_t w_row[4];
			vec4i_toInt4vu(v_w_row, &w_row[0]);
			swrSSE2Ref_clipRowRange(w_row[0], w_px[0], &pxmin, &pxmax);
			swrSSE2Ref_clipRowRange(w_row[1], w_px[1], &pxmin, &pxmax);
			swrSSE2Ref_clipRowRange(w_row[2], w_px[2], &pxmin, &pxmax);
		}

		// Calculate barycentric coords at pxmin
		const vec4i v_pxmin = vec4i_fromInt(pxmin);
		vec4i v_w = vec4i_add(v_w_row, vec4i_mullo_SSE2(v_w_px, v_pxmin));
		for (int32_t px = pxmin; px <= pxmax; ++px) {
			// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
			// Render the pixel
			{
#if SWR_CONFIG_NO_PIXEL_SHADER
				const uint32_t rgba = 0xFFFFFFFF;
#else
				int32_t w[4];
				vec4i_toInt4vu(v_w, &w[0]);
				assert(w[0] >= 0 && w[1] >= 0 && w[2] >= 0);
				const vec4f v_l = vec4f_mul(vec4f_fromVec4i(v_w), v_inv_area);
				const vec4f v_l0 = vec4f_getXXXX(v_l);
				const vec4f v_l1 = vec4f_getYYYY(v_l);
				// l2 = 1.0f - (l0 + l1)
				//
				// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
				// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
				// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
				// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=>
				//
				// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
				const vec4f v_c = vec4f_madd_SSE2(v_c02, v_l0, vec4f_madd_SSE2(v_c12, v_l1, v_c2));
				const uint32_t rgba = vec4f_toRGBA8(v_c);
#endif
				fb_row[px] = rgba;
			}
			v_w = vec4i_add(v_w, v_w_px);
		}
		v_w_row = vec4i_add(v_w_row, v_w_py);
		fb_row += ctx->m_Width;
	}
}
#endif
#if 0
// Old implementations (see Triangle Rasterizations posts)
//////////////////////////////////////////////////////////////////////////
// SSE2 implementation
//
#define USE_VEC4_LIB 0
// http://dss.stephanierct.com/DevBlog/?p=8
// Four 1.0f lanes. The array is declared without an explicit 16-byte alignment, so only
// unaligned loads (_mm_loadu_ps) are guaranteed to be safe on it.
static const float xmm_ones[] = { 1.0f, 1.0f, 1.0f, 1.0f };
// Lane-wise floor() of 4 packed floats using only SSE/SSE2 instructions.
//
// Each lane is truncated toward zero; lanes where the truncated value ended up greater than
// the input (negative non-integers) are corrected by subtracting 1.0.
//
// The constant 1.0 vector is built with _mm_set1_ps() instead of an aligned _mm_load_ps() from
// a plain float array, which carries no 16-byte alignment guarantee.
//
// Limitation: the round trip through _mm_cvttps_epi32() is only meaningful for lanes whose
// value fits in a signed 32-bit integer (NaN/out-of-range lanes convert to 0x80000000).
static inline __m128 _mm_floor_ps_SSE2(__m128 x)
{
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
	// All-ones mask in lanes where truncation rounded up.
	const __m128 rounded_up = _mm_cmpgt_ps(truncated, x);
	return _mm_sub_ps(truncated, _mm_and_ps(rounded_up, one));
}
// Lane-wise ceil() of 4 packed floats using only SSE/SSE2 instructions.
//
// Each lane is truncated toward zero; lanes where the truncated value ended up less than
// the input (positive non-integers) are corrected by adding 1.0.
//
// The constant 1.0 vector is built with _mm_set1_ps() instead of an aligned _mm_load_ps() from
// a plain float array, which carries no 16-byte alignment guarantee.
//
// Limitation: the round trip through _mm_cvttps_epi32() is only meaningful for lanes whose
// value fits in a signed 32-bit integer (NaN/out-of-range lanes convert to 0x80000000).
static inline __m128 _mm_ceil_ps_SSE2(__m128 x)
{
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
	// All-ones mask in lanes where truncation rounded down.
	const __m128 rounded_down = _mm_cmplt_ps(truncated, x);
	return _mm_add_ps(truncated, _mm_and_ps(rounded_down, one));
}
// SSE2 emulation of the SSE4.1 _mm_mullo_epi32(): multiplies the 4 packed 32-bit lanes of a and b
// and keeps the low 32 bits of every product.
//
// _mm_mul_epu32() only multiplies lanes 0 and 2 (producing two 64-bit products), so the odd lanes
// are shifted down by 4 bytes and multiplied separately; the low halves of the 4 products are then
// gathered and interleaved back into lane order.
static inline __m128i _mm_mullo_epi32_SSE2(__m128i a, __m128i b)
{
	// Bring lanes 1 and 3 into the even positions.
	const __m128i a_odd = _mm_srli_si128(a, 4);
	const __m128i b_odd = _mm_srli_si128(b, 4);

	const __m128i prod_even = _mm_mul_epu32(a, b);         // 64-bit products of lanes 0 and 2
	const __m128i prod_odd = _mm_mul_epu32(a_odd, b_odd);  // 64-bit products of lanes 1 and 3

	// Move the low 32 bits of both products into elements 0 and 1.
	const __m128i lo_even = _mm_shuffle_epi32(prod_even, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i lo_odd = _mm_shuffle_epi32(prod_odd, _MM_SHUFFLE(0, 0, 2, 0));

	// Interleave: (even0, odd0, even1, odd1) == lanes (0, 1, 2, 3).
	return _mm_unpacklo_epi32(lo_even, lo_odd);
}
// Integer-vector wrapper around _mm_shuffle_ps(): a and b are reinterpreted as packed floats
// (bit casts only, no conversion), shuffled according to imm8 and the result is cast back to __m128i.
#define _mm_shuffle_si128(a, b, imm8) _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), imm8))
static void swrDrawTriangleSSE2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
if (iarea == 0) {
// Degenerate triangle with 0 area.
return;
} else if (iarea < 0) {
// Swap (x1, y1) <-> (x2, y2)
{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
iarea = -iarea;
}
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
const int32_t bboxWidth = bboxMaxX - bboxMinX;
const int32_t bboxHeight = bboxMaxY - bboxMinY;
const int32_t dy01 = y0 - y1;
const int32_t dx01 = x0 - x1;
const int32_t dx20 = x2 - x0;
const int32_t dy20 = y2 - y0;
const int32_t dy01_dy20 = dy01 + dy20;
#if USE_VEC4_LIB
const vec4f v4f_rgba0 = vec4f_fromRGBA8(color0);
const vec4f v4f_rgba1 = vec4f_fromRGBA8(color1);
const vec4f v4f_rgba2 = vec4f_fromRGBA8(color2);
const vec4f v4f_drgba20 = vec4f_sub(v4f_rgba2, v4f_rgba0);
const vec4f v4f_drgba10 = vec4f_sub(v4f_rgba1, v4f_rgba0);
const vec4f v4f_r0 = vec4f_getXXXX(v4f_rgba0);
const vec4f v4f_g0 = vec4f_getYYYY(v4f_rgba0);
const vec4f v4f_b0 = vec4f_getZZZZ(v4f_rgba0);
const vec4f v4f_a0 = vec4f_getWWWW(v4f_rgba0);
const vec4f v4f_dr20 = vec4f_getXXXX(v4f_drgba20);
const vec4f v4f_dg20 = vec4f_getYYYY(v4f_drgba20);
const vec4f v4f_db20 = vec4f_getZZZZ(v4f_drgba20);
const vec4f v4f_da20 = vec4f_getWWWW(v4f_drgba20);
const vec4f v4f_dr10 = vec4f_getXXXX(v4f_drgba10);
const vec4f v4f_dg10 = vec4f_getYYYY(v4f_drgba10);
const vec4f v4f_db10 = vec4f_getZZZZ(v4f_drgba10);
const vec4f v4f_da10 = vec4f_getWWWW(v4f_drgba10);
const vec4f v4f_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
// Barycentric coordinate deltas for the X direction
const vec4i v4i_x_duvw_ = vec4i_fromInt4(-dy01, -dy20, dy01_dy20, 0);
const vec4f v4f_x_duvw_1 = vec4f_mul(vec4f_fromVec4i(v4i_x_duvw_), v4f_inv_area);
const vec4f v4f_x_duvw_2 = vec4f_add(v4f_x_duvw_1, v4f_x_duvw_1);
const vec4f v4f_x_duvw_3 = vec4f_add(v4f_x_duvw_1, v4f_x_duvw_2);
const vec4f v4f_x_duvw_4 = vec4f_add(v4f_x_duvw_2, v4f_x_duvw_2);
// UV deltas for the 1st and 2nd pixel
const vec4f v4f_x_duv0_duv1 = vec4f_shuffle(vec4f_zero(), v4f_x_duvw_1, VEC4F_SHUFFLE_XYXY);
// UV deltas for the 3rd and 4th pixel
const vec4f v4f_x_duv2_duv3 = vec4f_shuffle(v4f_x_duvw_2, v4f_x_duvw_3, VEC4F_SHUFFLE_XYXY);
const vec4f v4f_x_du4 = vec4f_getXXXX(v4f_x_duvw_4);
const vec4f v4f_x_dv4 = vec4f_getYYYY(v4f_x_duvw_4);
// Barycentric coordinate deltas for the Y direction
const vec4i v4i_y_duvw_ = vec4i_fromInt4(dx01, dx20, -(dx01 + dx20), 0);
// Calculate unnormalized barycentric coordinates of the bounding box min.
const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
vec4i v4i_row_uvw_ = vec4i_fromInt4(bboxMin_u, bboxMin_v, bboxMin_w, 0);
//
const vec4f v4f_row_uvw_scale = vec4f_fromFloat4(1.0f / (float)dy01, 1.0f / (float)dy20, 1.0f / (float)dy01_dy20, 0.0f);
#else
const __m128i imm_zero = _mm_setzero_si128();
const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);
const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 xmm_zero = _mm_setzero_ps();
const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);
// Barycentric coordinate deltas for the X direction
const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);
// UV deltas for the 1st and 2nd pixel
const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
// UV deltas for the 3rd and 4th pixel
const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));
// Barycentric coordinate deltas for the Y direction
const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);
// Calculate unnormalized barycentric coordinates of the bounding box min.
const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);
//
const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);
#endif
uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
int32_t ixmin = 0;
int32_t ixmax = (uint32_t)bboxWidth;
// Calculate ixmin and ixmax
{
int32_t row_uvw_[4];
#if USE_VEC4_LIB
vec4i_toInt4vu(v4i_row_uvw_, &row_uvw_[0]);
const vec4f v4f_row_uvw_ = vec4f_mul(vec4f_fromVec4i(v4i_row_uvw_), v4f_row_uvw_scale);
const vec4i v4i_row_uvw_floor = vec4i_fromVec4f(vec4f_floor_SSE2(v4f_row_uvw_));
const vec4i v4i_row_uvw_ceil = vec4i_fromVec4f(vec4f_ceil_SSE2(v4f_row_uvw_));
int32_t row_uvw_floor[4];
vec4i_toInt4vu(v4i_row_uvw_floor, &row_uvw_floor[0]);
int32_t row_uvw_ceil[4];
vec4i_toInt4vu(v4i_row_uvw_ceil, &row_uvw_ceil[0]);
#else
_mm_storeu_si128((__m128i*) & row_uvw_[0], imm_row_uvw_);
const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));
int32_t row_uvw_floor[4];
_mm_storeu_si128((__m128i*) & row_uvw_floor[0], imm_row_uvw_floor);
int32_t row_uvw_ceil[4];
_mm_storeu_si128((__m128i*) & row_uvw_ceil[0], imm_row_uvw_ceil);
#endif
if (dy01 > 0) {
ixmax = swr_mini(ixmax, row_uvw_floor[0]);
} else if (row_uvw_[0] != 0) {
ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
}
if (dy20 > 0) {
ixmax = swr_mini(ixmax, row_uvw_floor[1]);
} else if (row_uvw_[1] != 0) {
ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
}
if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
}
}
if (ixmin <= ixmax) {
#if USE_VEC4_LIB
// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
const vec4i v4i_p0uvw_ = vec4i_add(v4i_row_uvw_, vec4i_mullo_SSE2(vec4i_fromInt(ixmin), v4i_x_duvw_));
const vec4f v4f_p0uvw_ = vec4f_mul(vec4f_fromVec4i(v4i_p0uvw_), v4f_inv_area);
const vec4f v4f_p0uvuv = vec4f_getXYXY(v4f_p0uvw_);
// Calculate barycentric coordinates for the 4 pixels.
const vec4f v4f_p0uv_p1uv = vec4f_add(v4f_p0uvuv, v4f_x_duv0_duv1);
const vec4f v4f_p2uv_p3uv = vec4f_add(v4f_p0uvuv, v4f_x_duv2_duv3);
// Extract barycentric coordinates for each pixel
vec4f v4f_u0123 = vec4f_shuffle(v4f_p0uv_p1uv, v4f_p2uv_p3uv, VEC4F_SHUFFLE_XZXZ);
vec4f v4f_v0123 = vec4f_shuffle(v4f_p0uv_p1uv, v4f_p2uv_p3uv, VEC4F_SHUFFLE_YWYW);
#else
// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));
// Calculate barycentric coordinates for the 4 pixels.
const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels
// Extract barycentric coordinates for each pixel
__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));
#endif
uint32_t* frameBuffer = &framebufferRow[ixmin];
const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
// Calculate the color of each pixel
#if USE_VEC4_LIB
const vec4f v4f_r_p0123 = vec4f_add(v4f_r0, vec4f_add(vec4f_mul(v4f_dr10, v4f_v0123), vec4f_mul(v4f_dr20, v4f_u0123)));
const vec4f v4f_g_p0123 = vec4f_add(v4f_g0, vec4f_add(vec4f_mul(v4f_dg10, v4f_v0123), vec4f_mul(v4f_dg20, v4f_u0123)));
const vec4f v4f_b_p0123 = vec4f_add(v4f_b0, vec4f_add(vec4f_mul(v4f_db10, v4f_v0123), vec4f_mul(v4f_db20, v4f_u0123)));
const vec4f v4f_a_p0123 = vec4f_add(v4f_a0, vec4f_add(vec4f_mul(v4f_da10, v4f_v0123), vec4f_mul(v4f_da20, v4f_u0123)));
// Pack into uint8_t
// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
_mm_packs_epi32(_mm_cvtps_epi32(v4f_r_p0123.m_XMM), _mm_cvtps_epi32(v4f_g_p0123.m_XMM)),
_mm_packs_epi32(_mm_cvtps_epi32(v4f_b_p0123.m_XMM), _mm_cvtps_epi32(v4f_a_p0123.m_XMM))
);
#else
const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
// Pack into uint8_t
// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
);
#endif
// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
// _mm_shuffle_epi8() with SSE2
__m128i mask = _mm_set_epi8(0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF);
// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 =
_mm_packus_epi16(
_mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, mask),
_mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8)
);
// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b3, a2, r3, g3, b3, a3 }
const __m128i imm_rgba_p0123_u8 =
_mm_packus_epi16(
_mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, mask),
_mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8)
);
// Store
_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);
// Move on to the next set of pixels
#if USE_VEC4_LIB
v4f_u0123 = vec4f_add(v4f_u0123, v4f_x_du4);
v4f_v0123 = vec4f_add(v4f_u0123, v4f_x_dv4);
#else
xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
#endif
frameBuffer += 4;
}
// Calculate the colors of the 4 next pixels and selectively store only the number
// of remainder pixels for this row
const uint32_t rem = numPixels & 3;
{
#if USE_VEC4_LIB
const vec4f v4f_r_p0123 = vec4f_madd_SSE2(v4f_dr10, v4f_v0123, vec4f_madd_SSE2(v4f_dr20, v4f_u0123, v4f_r0));
const vec4f v4f_g_p0123 = vec4f_madd_SSE2(v4f_dg10, v4f_v0123, vec4f_madd_SSE2(v4f_dg20, v4f_u0123, v4f_g0));
const vec4f v4f_b_p0123 = vec4f_madd_SSE2(v4f_db10, v4f_v0123, vec4f_madd_SSE2(v4f_db20, v4f_u0123, v4f_b0));
const vec4f v4f_a_p0123 = vec4f_madd_SSE2(v4f_da10, v4f_v0123, vec4f_madd_SSE2(v4f_da20, v4f_u0123, v4f_a0));
// Pack into uint8_t
// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
_mm_packs_epi32(_mm_cvtps_epi32(v4f_r_p0123.m_XMM), _mm_cvtps_epi32(v4f_g_p0123.m_XMM)),
_mm_packs_epi32(_mm_cvtps_epi32(v4f_b_p0123.m_XMM), _mm_cvtps_epi32(v4f_a_p0123.m_XMM))
);
#else
const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
// Pack into uint8_t
// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
);
#endif
// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
// _mm_shuffle_epi8() with SSE2
__m128i mask = _mm_set_epi8(0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF);
// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 =
_mm_packus_epi16(
_mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, mask),
_mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8)
);
// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b3, a2, r3, g3, b3, a3 }
const __m128i imm_rgba_p0123_u8 =
_mm_packus_epi16(
_mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, mask),
_mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8)
);
switch (rem) {
case 1:
_mm_storeu_si32(frameBuffer, imm_rgba_p0123_u8);
break;
case 2:
_mm_storeu_si64(frameBuffer, imm_rgba_p0123_u8);
break;
case 3:
_mm_storeu_si64(frameBuffer, imm_rgba_p0123_u8);
_mm_storeu_si32(&frameBuffer[2], _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(2, 2, 2, 2)));
break;
case 0:
default:
break;
}
}
}
#if USE_VEC4_LIB
// Move on to the next row of pixels.
v4i_row_uvw_ = vec4i_add(v4i_row_uvw_, v4i_y_duvw_);
#else
// Move on to the next row of pixels.
imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
#endif
framebufferRow += ctx->m_Width;
}
}
// Draws a color-interpolated triangle, shading 4 horizontally adjacent pixels per iteration.
// This variant uses the SSSE3 byte shuffle (_mm_shuffle_epi8) to interleave the color channels.
//
// Parameters:
//   ctx            - render target; pixels are written to ctx->m_FrameBuffer, addressed as
//                    x + y * ctx->m_Width.
//   x0,y0..x2,y2   - integer vertex positions. The scanned bounding box is clamped to
//                    [0, m_Width - 1] x [0, m_Height - 1].
//   color0..color2 - packed per-vertex colors. Each of the 4 bytes is interpolated on its own;
//                    the lowest byte is called "r" below and the highest "a".
//
// Triangles with zero area are ignored. When the signed area is negative, vertices 1 and 2
// (and their colors) are swapped so the rest of the code always works with a positive area.
//
// For every row of the bounding box the span [ixmin, ixmax] (relative to bboxMinX) is found by
// solving the three edge functions u, v and w for x, so no per-pixel inside test is needed.
// u is the weight of vertex 2, v the weight of vertex 1 and w the weight of vertex 0.
static void swrDrawTriangleSSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box of the triangle, clamped to the frame buffer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Expand each packed color into 4 floats (one per channel) and broadcast every channel
    // of color0 and of the deltas (color2 - color0), (color1 - color0) to all 4 lanes.
    const __m128i imm_zero = _mm_setzero_si128();
    const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
    const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);
    const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);
    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
    // U and V steps between two consecutive groups of 4 pixels
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Per-edge reciprocals used to solve "edge(x) == 0" for x on each row.
    // NOTE: a zero delta gives an infinite scale; the `row_uvw_[i] != 0` tests below appear to
    // exist to avoid the resulting 0 * inf case.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    // Byte shuffle turning { r0..r3, g0..g3, b0..b3, a0..a3 } into 4 packed RGBA pixels.
    const __m128i imm_channelsToPixels = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
            const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

            // u changes by -dy01 per pixel: it either limits the right or the left end of the span.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // Same for v, which changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // w changes by +dy01_dy20 per pixel.
            if (dy01_dy20 < 0) {
                // w only decreases along the row. If it is already negative at the left edge
                // of the (clamped) bounding box, the bound below is negative and the whole row
                // is skipped. This happens for triangles clipped by the left screen edge.
                ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // 3rd and 4th pixels

            // Extract barycentric coordinates for each pixel
            __m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
            __m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate the color of each pixel
                const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

                // Pack into uint8_t
                // (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
                const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
                );

                // Shuffle into RGBA uint32_t
                const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_channelsToPixels);

                // Store
                _mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

                // Move on to the next set of pixels
                xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
                xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Calculate the colors of the 4 next pixels and selectively store only the number
            // of remainder pixels for this row
            const uint32_t rem = numPixels & 3;
            if (rem != 0) {
                // Calculate the color of each pixel
                const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

                // Pack into uint8_t
                // (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
                const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
                );

                // Shuffle into RGBA uint32_t
                const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_channelsToPixels);

                // Store the 1st pixel, then either the 2nd or the 2nd and 3rd.
                _mm_storeu_si32(frameBuffer, imm_rgba_p0123_u8);
                frameBuffer++;
                if (rem == 2) {
                    _mm_storeu_si32(frameBuffer, _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(1, 1, 1, 1)));
                } else if (rem == 3) {
                    _mm_storeu_si64(frameBuffer, _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(2, 1, 2, 1)));
                }
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Draws a color-interpolated triangle, shading 4 horizontally adjacent pixels per iteration.
// This variant relies on SSE4.1 (_mm_floor_ps, _mm_ceil_ps, _mm_mullo_epi32, _mm_extract_epi32)
// and on the SSSE3 byte shuffle (_mm_shuffle_epi8).
//
// Parameters:
//   ctx            - render target; pixels are written to ctx->m_FrameBuffer, addressed as
//                    x + y * ctx->m_Width.
//   x0,y0..x2,y2   - integer vertex positions. The scanned bounding box is clamped to
//                    [0, m_Width - 1] x [0, m_Height - 1].
//   color0..color2 - packed per-vertex colors. Each of the 4 bytes is interpolated on its own;
//                    the lowest byte is called "r" below and the highest "a".
//
// Triangles with zero area are ignored. When the signed area is negative, vertices 1 and 2
// (and their colors) are swapped so the rest of the code always works with a positive area.
//
// For every row of the bounding box the span [ixmin, ixmax] (relative to bboxMinX) is found by
// solving the three edge functions u, v and w for x, so no per-pixel inside test is needed.
// u is the weight of vertex 2, v the weight of vertex 1 and w the weight of vertex 0.
// Only pixels inside the span are read or written.
static void swrDrawTriangleSSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box of the triangle, clamped to the frame buffer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Expand each packed color into 4 floats (one per channel) and broadcast every channel
    // of color0 and of the deltas (color2 - color0), (color1 - color0) to all 4 lanes.
    const __m128i imm_zero = _mm_setzero_si128();
    const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
    const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);
    const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);
    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
    // U and V steps between two consecutive groups of 4 pixels
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Per-edge reciprocals used to solve "edge(x) == 0" for x on each row.
    // NOTE: a zero delta gives an infinite scale; the `row_uvw_[i] != 0` tests below appear to
    // exist to avoid the resulting 0 * inf case.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    // Byte shuffle turning { r0..r3, g0..g3, b0..b3, a0..a3 } into 4 packed RGBA pixels.
    const __m128i imm_channelsToPixels = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps(xmm_row_uvw_));
            const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps(xmm_row_uvw_));

            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

            // u changes by -dy01 per pixel: it either limits the right or the left end of the span.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // Same for v, which changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // w changes by +dy01_dy20 per pixel.
            if (dy01_dy20 < 0) {
                // w only decreases along the row. If it is already negative at the left edge
                // of the (clamped) bounding box, the bound below is negative and the whole row
                // is skipped. This happens for triangles clipped by the left screen edge.
                ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // 3rd and 4th pixels

            // Extract barycentric coordinates for each pixel
            __m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
            __m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate the color of each pixel
                const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

                // Pack into uint8_t
                // (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
                const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
                );

                // Shuffle into RGBA uint32_t
                const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_channelsToPixels);

                // Store
                _mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

                // Move on to the next set of pixels
                xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
                xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Calculate the colors of the 4 next pixels and store only the remainder pixels of
            // this row. Nothing beyond ixmax is read or written, so the last span of the last
            // row never touches memory past the end of the frame buffer.
            const uint32_t rem = numPixels & 3;
            if (rem != 0) {
                // Calculate the color of each pixel
                const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

                // Pack into uint8_t
                // (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
                const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
                );

                // Shuffle into RGBA uint32_t
                const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_channelsToPixels);

                // Store exactly `rem` pixels (1, 2 or 3).
                frameBuffer[0] = (uint32_t)_mm_cvtsi128_si32(imm_rgba_p0123_u8);
                if (rem >= 2) {
                    frameBuffer[1] = (uint32_t)_mm_extract_epi32(imm_rgba_p0123_u8, 1);
                }
                if (rem == 3) {
                    frameBuffer[2] = (uint32_t)_mm_extract_epi32(imm_rgba_p0123_u8, 2);
                }
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
#endif
#ifndef SWR_SWR_MATH_H
#define SWR_SWR_MATH_H
#include <stdint.h>
#include <stdbool.h>
#include <immintrin.h>
// Thin wrapper around one SSE register holding four packed single-precision floats.
typedef struct vec4f
{
__m128 m_XMM; // The wrapped 128-bit float register value.
} vec4f;
// Thin wrapper around one SSE register holding 128 bits of integer data.
// The functions of this header treat it as four packed 32-bit integers.
typedef struct vec4i
{
__m128i m_IMM; // The wrapped 128-bit integer register value.
} vec4i;
static inline vec4f vec4f_zero(void)
{
return (vec4f){ .m_XMM = _mm_setzero_ps() };
}
static inline vec4f vec4f_fromFloat(float x)
{
return (vec4f){ .m_XMM = _mm_set_ps1(x) };
}
// Converts four signed 32-bit integers to four floats, lane by lane.
static inline vec4f vec4f_fromVec4i(vec4i x)
{
    vec4f result;
    result.m_XMM = _mm_cvtepi32_ps(x.m_IMM);
    return result;
}
static inline vec4f vec4f_fromFloat4(float x0, float x1, float x2, float x3)
{
return (vec4f){ .m_XMM = _mm_set_ps(x3, x2, x1, x0) };
}
static inline vec4f vec4f_fromRGBA8(uint32_t rgba8)
{
const __m128i imm_zero = _mm_setzero_si128();
const __m128i imm_rgba8 = _mm_cvtsi32_si128(rgba8);
const __m128i imm_rgba16 = _mm_unpacklo_epi8(imm_rgba8, imm_zero);
const __m128i imm_rgba32 = _mm_unpacklo_epi16(imm_rgba16, imm_zero);
return (vec4f){
.m_XMM = _mm_cvtepi32_ps(imm_rgba32)
};
}
// Packs four floats into a 4x8-bit color; lane 0 becomes the lowest byte of the result.
// Values are converted to integers with _mm_cvtps_epi32() and then saturated by the two pack
// steps, so out-of-range channels are clamped to [0, 255].
static inline uint32_t vec4f_toRGBA8(vec4f x)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i channels32 = _mm_cvtps_epi32(x.m_XMM);
    // 32 -> 16 bits with signed saturation, then 16 -> 8 bits with unsigned saturation.
    const __m128i channels8 = _mm_packus_epi16(_mm_packs_epi32(channels32, zero), zero);
    return (uint32_t)_mm_cvtsi128_si32(channels8);
}
// Lane-wise a + b.
static inline vec4f vec4f_add(vec4f a, vec4f b)
{
    vec4f sum;
    sum.m_XMM = _mm_add_ps(a.m_XMM, b.m_XMM);
    return sum;
}
// Lane-wise a - b.
static inline vec4f vec4f_sub(vec4f a, vec4f b)
{
    vec4f difference;
    difference.m_XMM = _mm_sub_ps(a.m_XMM, b.m_XMM);
    return difference;
}
// Lane-wise a * b.
static inline vec4f vec4f_mul(vec4f a, vec4f b)
{
    vec4f product;
    product.m_XMM = _mm_mul_ps(a.m_XMM, b.m_XMM);
    return product;
}
// Builds the 8-bit immediate expected by _mm_shuffle_ps(a, b, imm).
// Each argument is a source lane index (0..3) taking 2 bits of the immediate:
// result lanes 0 and 1 are picked from `a`, result lanes 2 and 3 from `b`.
#define VEC4_SHUFFLE_MASK(lane0FromA, lane1FromA, lane2FromB, lane3FromB) \
    (((lane3FromB) << 6) | ((lane2FromB) << 4) | ((lane1FromA) << 2) | ((lane0FromA)))

// Ready-made shuffle immediates, named after the source lanes (X = 0, Y = 1, Z = 2, W = 3)
// that end up in result lanes 0, 1, 2 and 3.
typedef enum vec4_shuffle_mask
{
    // Broadcast a single lane.
    VEC4_SHUFFLE_XXXX = VEC4_SHUFFLE_MASK(0, 0, 0, 0),
    VEC4_SHUFFLE_YYYY = VEC4_SHUFFLE_MASK(1, 1, 1, 1),
    VEC4_SHUFFLE_ZZZZ = VEC4_SHUFFLE_MASK(2, 2, 2, 2),
    VEC4_SHUFFLE_WWWW = VEC4_SHUFFLE_MASK(3, 3, 3, 3),

    // Pick the same pair of lanes from both sources.
    VEC4_SHUFFLE_XYXY = VEC4_SHUFFLE_MASK(0, 1, 0, 1),
    VEC4_SHUFFLE_XZXZ = VEC4_SHUFFLE_MASK(0, 2, 0, 2),
    VEC4_SHUFFLE_YWYW = VEC4_SHUFFLE_MASK(1, 3, 1, 3),
    VEC4_SHUFFLE_ZWZW = VEC4_SHUFFLE_MASK(2, 3, 2, 3),
} vec4_shuffle_mask;
// Defines vec4f_get<swizzle>(x), which returns x with its lanes rearranged according to
// VEC4_SHUFFLE_<swizzle> (e.g. vec4f_getXYXY(x) returns { x.X, x.Y, x.X, x.Y }).
// The macro expands to a complete function definition, so it must NOT be followed by ';'.
// A stray ';' after a function definition is an empty file-scope declaration, which ISO C
// does not allow (compilers only accept it as an extension).
#define VEC4F_GET_FUNC(swizzle) \
    static inline vec4f vec4f_get##swizzle(vec4f x) \
    { \
        return (vec4f){ .m_XMM = _mm_shuffle_ps(x.m_XMM, x.m_XMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)) }; \
    }

VEC4F_GET_FUNC(XXXX)
VEC4F_GET_FUNC(YYYY)
VEC4F_GET_FUNC(ZZZZ)
VEC4F_GET_FUNC(WWWW)
VEC4F_GET_FUNC(XYXY)
VEC4F_GET_FUNC(ZWZW)
// Returns { a[m0], a[m1], b[m2], b[m3] }, where m0..m3 are the 2-bit lane selectors packed
// into `mask` (see VEC4_SHUFFLE_MASK).
// Function-like macro because mask must be an immediate (constant).
// Arguments and the whole expansion are parenthesized so that any expression can be passed
// as `a`, `b` or `mask` and the result can be used inside a larger expression.
#define vec4f_shuffle(a, b, mask) ((vec4f){ .m_XMM = _mm_shuffle_ps((a).m_XMM, (b).m_XMM, (mask)) })
// Lane-wise floor(x) using only SSE2 instructions.
// http://dss.stephanierct.com/DevBlog/?p=8
//
// The value is truncated towards zero through an int32 round trip and 1.0f is subtracted
// wherever the truncated value ended up greater than x (i.e. for negative non-integers).
// NOTE(review): because of the int32 round trip the result is only meaningful for inputs
// that fit into a signed 32-bit integer.
static inline vec4f vec4f_floor_SSE2(vec4f x)
{
    // Built in a register instead of being loaded with _mm_load_ps() from a static float
    // array: that load requires 16-byte alignment, which a plain float[4] does not guarantee.
    const __m128 ones = _mm_set1_ps(1.0f);
    const __m128i i = _mm_cvttps_epi32(x.m_XMM);
    const __m128 fi = _mm_cvtepi32_ps(i);
    // All-ones mask in the lanes where truncation rounded up.
    const __m128 igx = _mm_cmpgt_ps(fi, x.m_XMM);
    const __m128 j = _mm_and_ps(igx, ones);
    return (vec4f){ .m_XMM = _mm_sub_ps(fi, j) };
}
// Lane-wise ceil(x) using only SSE2 instructions.
// http://dss.stephanierct.com/DevBlog/?p=8
//
// The value is truncated towards zero through an int32 round trip and 1.0f is added
// wherever the truncated value ended up smaller than x (i.e. for positive non-integers).
// NOTE(review): because of the int32 round trip the result is only meaningful for inputs
// that fit into a signed 32-bit integer.
static inline vec4f vec4f_ceil_SSE2(vec4f x)
{
    // Built in a register instead of being loaded with _mm_load_ps() from a static float
    // array: that load requires 16-byte alignment, which a plain float[4] does not guarantee.
    const __m128 ones = _mm_set1_ps(1.0f);
    const __m128i i = _mm_cvttps_epi32(x.m_XMM);
    const __m128 fi = _mm_cvtepi32_ps(i);
    // All-ones mask in the lanes where truncation rounded down.
    const __m128 igx = _mm_cmplt_ps(fi, x.m_XMM);
    const __m128 j = _mm_and_ps(igx, ones);
    return (vec4f){ .m_XMM = _mm_add_ps(fi, j) };
}
// Lane-wise floor(x) using the SSE4.1 rounding instruction.
static inline vec4f vec4f_floor_SSE41(vec4f x)
{
    vec4f result;
    result.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_FLOOR);
    return result;
}
// Lane-wise ceil(x) using the SSE4.1 rounding instruction.
static inline vec4f vec4f_ceil_SSE41(vec4f x)
{
    vec4f result;
    result.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_CEIL);
    return result;
}
// Lane-wise multiply-add: returns c + a * b, computed as a separate multiply and add.
static inline vec4f vec4f_madd_SSE2(vec4f a, vec4f b, vec4f c)
{
    const __m128 product = _mm_mul_ps(a.m_XMM, b.m_XMM);
    vec4f result;
    result.m_XMM = _mm_add_ps(c.m_XMM, product);
    return result;
}
static inline vec4i vec4i_zero(void)
{
return (vec4i){ .m_IMM = _mm_setzero_si128()};
}
static inline vec4i vec4i_one(void)
{
return (vec4i){ .m_IMM = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()) };
}
static inline vec4i vec4i_fromInt(int32_t x)
{
return (vec4i){ .m_IMM = _mm_set1_epi32(x) };
}
// Converts four floats to four signed 32-bit integers, lane by lane, using
// _mm_cvtps_epi32() (which rounds, as opposed to the truncating _mm_cvttps_epi32()).
static inline vec4i vec4i_fromVec4f(vec4f x)
{
    vec4i result;
    result.m_IMM = _mm_cvtps_epi32(x.m_XMM);
    return result;
}
static inline vec4i vec4i_fromInt4(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
{
return (vec4i){ .m_IMM = _mm_set_epi32(x3, x2, x1, x0) };
}
static inline vec4i vec4i_fromInt4va(const int32_t* arr)
{
return (vec4i){ .m_IMM = _mm_load_si128((const __m128i*)arr) };
}
// Stores the four 32-bit lanes of x to arr[0..3].
// Uses the unaligned store, so arr may have any alignment.
static inline void vec4i_toInt4vu(vec4i x, int32_t* arr)
{
    __m128i* destination = (__m128i*)arr;
    _mm_storeu_si128(destination, x.m_IMM);
}
// Stores the four 32-bit lanes of x to arr[0..3].
// Uses the aligned store, so arr must be 16-byte aligned.
static inline void vec4i_toInt4va(vec4i x, int32_t* arr)
{
    __m128i* destination = (__m128i*)arr;
    _mm_store_si128(destination, x.m_IMM);
}
// Masked store: every bit of buffer[0..3] whose mask bit is set is replaced by the matching
// bit of x; bits whose mask bit is clear keep their current value.
// Implemented as an aligned load / merge / aligned store of all 16 bytes, so buffer must be
// 16-byte aligned and all 16 bytes are read and written regardless of the mask.
// (_mm_maskmoveu_si128() would be an alternative way of doing the store.)
static inline void vec4i_toInt4va_masked(vec4i x, vec4i mask, int32_t* buffer)
{
    __m128i* target = (__m128i*)buffer;
    const __m128i current = _mm_load_si128((const __m128i*)buffer);
    const __m128i kept = _mm_andnot_si128(mask.m_IMM, current); // current & ~mask
    const __m128i replaced = _mm_and_si128(mask.m_IMM, x.m_IMM); // x & mask
    _mm_store_si128(target, _mm_or_si128(kept, replaced));
}
// Inverse-masked store: for every bit CLEAR in maskInv the corresponding bit
// of x is written to buffer; bits that are set in maskInv keep the value
// already in buffer.
// Implemented as an aligned load, a bitwise merge and an aligned store, so
// buffer must be 16-byte aligned and all 16 bytes are read and rewritten.
// (A _mm_maskmoveu_si128 based variant used to be kept here, disabled.)
static inline void vec4i_toInt4va_maskedInv_SSE2(vec4i x, vec4i maskInv, int32_t* buffer)
{
    __m128i* dst = (__m128i*)buffer;
    const __m128i current = _mm_load_si128((const __m128i*)dst);

    const __m128i incoming = _mm_andnot_si128(maskInv.m_IMM, x.m_IMM); // x & ~maskInv
    const __m128i preserved = _mm_and_si128(maskInv.m_IMM, current);   // current & maskInv

    _mm_store_si128(dst, _mm_or_si128(preserved, incoming));
}
// Inverse-masked store using the SSE4.1 byte blend.
// For every byte whose most significant bit is set in maskInv the existing
// byte in buffer is kept; all other bytes are replaced by the matching byte
// of x. Note that _mm_blendv_epi8 selects per byte on the top bit only, so
// this matches a bitwise select only when each mask byte is 0x00 or 0xFF.
// buffer must be 16-byte aligned (aligned load/store).
static inline void vec4i_toInt4va_maskedInv_SSE41(vec4i x, vec4i maskInv, int32_t* buffer)
{
    __m128i* dst = (__m128i*)buffer;
    const __m128i current = _mm_load_si128((const __m128i*)dst);
    const __m128i merged = _mm_blendv_epi8(x.m_IMM, current, maskInv.m_IMM);
    _mm_store_si128(dst, merged);
}
// Extracts the lowest 32-bit lane of x as a scalar.
static inline int32_t vec4i_toInt(vec4i x)
{
    const int32_t lowLane = _mm_cvtsi128_si32(x.m_IMM);
    return lowLane;
}
// Per-lane 32-bit integer addition (wraps on overflow).
static inline vec4i vec4i_add(vec4i a, vec4i b)
{
    const vec4i sum = { .m_IMM = _mm_add_epi32(a.m_IMM, b.m_IMM) };
    return sum;
}
// Per-lane 32-bit integer subtraction a - b (wraps on overflow).
static inline vec4i vec4i_sub(vec4i a, vec4i b)
{
    const vec4i difference = { .m_IMM = _mm_sub_epi32(a.m_IMM, b.m_IMM) };
    return difference;
}
// Per-lane 32-bit multiply keeping the low 32 bits of each product, built
// from SSE2 primitives only (SSE2 has no 32-bit mullo instruction).
static inline vec4i vec4i_mullo_SSE2(vec4i a, vec4i b)
{
    // _mm_mul_epu32 multiplies lanes 0 and 2 only, giving two 64-bit products.
    const __m128i evenProducts = _mm_mul_epu32(a.m_IMM, b.m_IMM);

    // Shift the operands right by one lane (4 bytes) so that lanes 1 and 3
    // land in positions 0 and 2, then multiply those as well.
    const __m128i aOdd = _mm_srli_si128(a.m_IMM, 4);
    const __m128i bOdd = _mm_srli_si128(b.m_IMM, 4);
    const __m128i oddProducts = _mm_mul_epu32(aOdd, bOdd);

    // Gather the low 32 bits of each 64-bit product into the two lowest lanes.
    const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i oddLow = _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(0, 0, 2, 0));

    // Interleave back into lane order: { p0, p1, p2, p3 }.
    const vec4i result = { .m_IMM = _mm_unpacklo_epi32(evenLow, oddLow) };
    return result;
}
// Per-lane 32-bit multiply keeping the low 32 bits of each product (SSE4.1).
static inline vec4i vec4i_mullo_SSE41(vec4i a, vec4i b)
{
    const vec4i product = { .m_IMM = _mm_mullo_epi32(a.m_IMM, b.m_IMM) };
    return product;
}
// Bitwise AND of all 128 bits: a & b.
static inline vec4i vec4i_and(vec4i a, vec4i b)
{
    const vec4i result = { .m_IMM = _mm_and_si128(a.m_IMM, b.m_IMM) };
    return result;
}
// Bitwise OR of all 128 bits: a | b.
static inline vec4i vec4i_or(vec4i a, vec4i b)
{
    const vec4i result = { .m_IMM = _mm_or_si128(a.m_IMM, b.m_IMM) };
    return result;
}
// Bitwise OR of three vectors: a | b | c.
static inline vec4i vec4i_or3(vec4i a, vec4i b, vec4i c)
{
    const __m128i bOrC = _mm_or_si128(b.m_IMM, c.m_IMM);
    const vec4i result = { .m_IMM = _mm_or_si128(a.m_IMM, bOrC) };
    return result;
}
// Bitwise AND-NOT with the operand order of _mm_andnot_si128: (~a) & b.
// Note that it is the FIRST argument that gets inverted.
static inline vec4i vec4i_andnot(vec4i a, vec4i b)
{
    const vec4i result = { .m_IMM = _mm_andnot_si128(a.m_IMM, b.m_IMM) };
    return result;
}
// Bitwise XOR of all 128 bits: a ^ b.
static inline vec4i vec4i_xor(vec4i a, vec4i b)
{
    const vec4i result = { .m_IMM = _mm_xor_si128(a.m_IMM, b.m_IMM) };
    return result;
}
// Arithmetic (sign-extending) right shift of each 32-bit lane by shift bits.
static inline vec4i vec4i_sar(vec4i x, uint32_t shift)
{
    const vec4i shifted = { .m_IMM = _mm_srai_epi32(x.m_IMM, shift) };
    return shifted;
}
// Left shift of each 32-bit lane by shift bits, shifting in zeros.
static inline vec4i vec4i_sal(vec4i x, uint32_t shift)
{
    const vec4i shifted = { .m_IMM = _mm_slli_epi32(x.m_IMM, shift) };
    return shifted;
}
// Per-lane signed comparison a < b.
// Each result lane is all ones (0xFFFFFFFF) where true and zero where false.
static inline vec4i vec4i_cmplt(vec4i a, vec4i b)
{
    const vec4i laneMask = { .m_IMM = _mm_cmplt_epi32(a.m_IMM, b.m_IMM) };
    return laneMask;
}
// Packs four per-channel vectors (one 32-bit value per pixel) into four
// interleaved 8-bit-per-channel pixels, using SSE2 only.
// Lane i of the result holds the bytes { r[i], g[i], b[i], a[i] } from the
// lowest to the highest byte. Channel values are saturated on the way down:
// first to signed 16-bit (_mm_packs_epi32), then to unsigned 8-bit
// (_mm_packus_epi16).
// The byte permutation emulates _mm_shuffle_epi8, see
// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSE2(vec4i r, vec4i g, vec4i b, vec4i a)
{
    // 0x00FF in every 16-bit lane: keeps the even-indexed byte of each pair.
    const __m128i evenByteMask = _mm_set1_epi16(0x00FF);

    // Planar bytes:
    // { r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
    const __m128i rg16 = _mm_packs_epi32(r.m_IMM, g.m_IMM);
    const __m128i ba16 = _mm_packs_epi32(b.m_IMM, a.m_IMM);
    const __m128i planar = _mm_packus_epi16(rg16, ba16);

    // First de-interleave pass (even bytes first, then odd bytes):
    // { r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
    const __m128i pass1Even = _mm_and_si128(planar, evenByteMask);
    const __m128i pass1Odd = _mm_srli_epi16(planar, 8);
    const __m128i pass1 = _mm_packus_epi16(pass1Even, pass1Odd);

    // Second pass yields the interleaved pixels:
    // { r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
    const __m128i pass2Even = _mm_and_si128(pass1, evenByteMask);
    const __m128i pass2Odd = _mm_srli_epi16(pass1, 8);
    const __m128i pixels = _mm_packus_epi16(pass2Even, pass2Odd);

    const vec4i result = { .m_IMM = pixels };
    return result;
}
// Packs four per-channel vectors (one 32-bit value per pixel) into four
// interleaved 8-bit-per-channel pixels, using the SSSE3 byte shuffle.
// Lane i of the result holds the bytes { r[i], g[i], b[i], a[i] } from the
// lowest to the highest byte. Channel values are saturated on the way down:
// first to signed 16-bit (_mm_packs_epi32), then to unsigned 8-bit
// (_mm_packus_epi16).
static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSSE3(vec4i r, vec4i g, vec4i b, vec4i a)
{
    // Planar bytes:
    // { r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
    const __m128i rg16 = _mm_packs_epi32(r.m_IMM, g.m_IMM);
    const __m128i ba16 = _mm_packs_epi32(b.m_IMM, a.m_IMM);
    const __m128i planar = _mm_packus_epi16(rg16, ba16);

    // Source byte index for each destination byte, listed low-to-high:
    // a 4x4 byte transpose from planar to interleaved order.
    const __m128i transpose = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    const vec4i result = { .m_IMM = _mm_shuffle_epi8(planar, transpose) };
    return result;
}
// Returns true when at least one 32-bit lane of x is negative.
// _mm_movemask_ps gathers the top (sign) bit of each lane, so no explicit
// compare against zero is needed.
static inline bool vec4i_any_neg_SSE2(vec4i x)
{
    const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
    return signBits != 0;
}
// Returns true when all four 32-bit lanes of x are negative, i.e. when all
// four sign bits gathered by _mm_movemask_ps are set.
static inline bool vec4i_all_neg_SSE2(vec4i x)
{
    const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
    return signBits == 0x0F;
}
// Returns a 4-bit mask in which bit i is the sign bit of lane i of x.
static inline uint32_t vec4i_getSignMask(vec4i x)
{
    const __m128 asFloatBits = _mm_castsi128_ps(x.m_IMM);
    return (uint32_t)_mm_movemask_ps(asFloatBits);
}
// Generates vec4i_get<swizzle>(), which returns x with its four 32-bit lanes
// rearranged by _mm_shuffle_epi32 using the VEC4_SHUFFLE_<swizzle> constant
// (presumably defined earlier in this header).
// The macro expands to a complete function definition, so it is invoked
// WITHOUT a trailing ';': a stray ';' at file scope is not valid ISO C and
// is diagnosed by -Wpedantic.
#define VEC4I_GET_FUNC(swizzle) \
static inline vec4i vec4i_get##swizzle(vec4i x) \
{ \
    return (vec4i){ .m_IMM = _mm_shuffle_epi32(x.m_IMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)) }; \
}
VEC4I_GET_FUNC(XXXX)
VEC4I_GET_FUNC(YYYY)
VEC4I_GET_FUNC(ZZZZ)
VEC4I_GET_FUNC(WWWW)
VEC4I_GET_FUNC(XYXY)
VEC4I_GET_FUNC(ZWZW)
// Scalar integer helpers. The definitions live in inline/swr_math.inl and are
// `static inline`, so the prototypes must carry the same internal linkage:
// a static definition following a non-static declaration is a linkage
// conflict (a hard error on GCC/Clang).
static inline int32_t swr_absi(int32_t x);
static inline int32_t swr_mini(int32_t a, int32_t b);
static inline int32_t swr_maxi(int32_t a, int32_t b);
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c);
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c);
static inline int32_t swr_alignDown(int32_t x, uint32_t align);
static inline int32_t swr_alignUp(int32_t x, uint32_t align);
static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom);
static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom);
#endif
#include "inline/swr_math.inl"
#ifndef SWR_SWR_MATH_H
#error "Must be included from swr_math.h"
#endif
// Returns the absolute value of x.
// x must not be INT32_MIN: its magnitude is not representable in int32_t, so
// the negation would overflow.
static inline int32_t swr_absi(int32_t x)
{
    if (x < 0)
    {
        return -x;
    }
    return x;
}
// Returns the smaller of a and b.
static inline int32_t swr_mini(int32_t a, int32_t b)
{
    if (a < b)
    {
        return a;
    }
    return b;
}
// Returns the larger of a and b.
static inline int32_t swr_maxi(int32_t a, int32_t b)
{
    if (a > b)
    {
        return a;
    }
    return b;
}
// Returns the smallest of a, b and c.
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c)
{
    const int32_t minBC = swr_mini(b, c);
    return swr_mini(a, minBC);
}
// Returns the largest of a, b and c.
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c)
{
    const int32_t maxBC = swr_maxi(b, c);
    return swr_maxi(a, maxBC);
}
// Rounds x down (toward negative infinity) to the nearest multiple of align.
//
// x:     value to align; may be negative.
// align: alignment step; must be non-zero (division by zero otherwise).
//        Does not have to be a power of two.
// Returns the largest multiple of align that is <= x. The result must be
// representable in int32_t.
static inline int32_t swr_alignDown(int32_t x, uint32_t align)
{
    // Do the arithmetic in signed 64-bit. In a mixed int32_t/uint32_t
    // expression x is converted to unsigned, which only gives the right answer
    // for negative x when align happens to be a power of two.
    const int64_t step = (int64_t)align;
    const int64_t value = (int64_t)x;

    int64_t rem = value % step;
    if (rem < 0)
    {
        // C's % takes the sign of the dividend; turn it into a floor remainder.
        rem += step;
    }
    return (int32_t)(value - rem);
}
// Rounds x up (toward positive infinity) to the nearest multiple of align.
//
// x:     value to align; may be negative.
// align: alignment step; must be non-zero (division by zero otherwise).
//        Does not have to be a power of two.
// Returns the smallest multiple of align that is >= x. The result must be
// representable in int32_t.
static inline int32_t swr_alignUp(int32_t x, uint32_t align)
{
    // Do the arithmetic in signed 64-bit. In a mixed int32_t/uint32_t
    // expression x is converted to unsigned, which only gives the right answer
    // for negative x when align happens to be a power of two.
    const int64_t step = (int64_t)align;
    const int64_t value = (int64_t)x;

    int64_t rem = value % step;
    if (rem < 0)
    {
        // C's % takes the sign of the dividend; turn it into a floor remainder.
        rem += step;
    }
    if (rem == 0)
    {
        return x; // Already aligned.
    }
    return (int32_t)(value + (step - rem));
}
// Integer division rounding toward negative infinity: floor(numer / denom).
//
// denom must be non-zero, and (INT32_MIN, -1) is not a valid input because the
// quotient is not representable.
// For operands of the same sign the result equals plain `/`; it differs only
// when the exact quotient is negative and not an integer.
static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom)
{
    int32_t quot = numer / denom; // Truncates toward zero.
    const int32_t rem = numer % denom;

    // A non-zero remainder whose sign differs from the divisor's means the
    // exact quotient was negative, so truncation rounded it up: step back.
    if (rem != 0 && ((rem < 0) != (denom < 0)))
    {
        --quot;
    }
    return quot;
}
// Integer division rounding toward positive infinity: ceil(numer / denom).
//
// denom must be non-zero, and (INT32_MIN, -1) is not a valid input because the
// quotient is not representable.
// For non-negative numer and positive denom the result is unchanged from the
// simple "add one when there is a remainder" form.
static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom)
{
    int32_t quot = numer / denom; // Truncates toward zero.
    const int32_t rem = numer % denom;

    // Only bump the quotient when the exact result is positive, i.e. when the
    // remainder has the same sign as the divisor. For a negative exact
    // quotient, truncation toward zero already is the ceiling.
    if (rem != 0 && ((rem < 0) == (denom < 0)))
    {
        ++quot;
    }
    return quot;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment