// Software Renderer
//
// Source: GitHub gist jdryg/0947bf63db538b4d1e587ffa6fe8b642
// (last active January 4, 2023 18:38).
//
// Note from the hosting page: this file contains bidirectional Unicode text
// that may be interpreted or compiled differently than what appears below.
// To review, open the file in an editor that reveals hidden Unicode characters.
#include "swr.h"
#include "swr_math.h"

#include <assert.h>
#include <immintrin.h>
#include <malloc.h>
#include <memory.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// When non-zero, the triangle rasterizers skip per-pixel color interpolation
// and write a constant 0xFFFFFFFF for every covered pixel.
#define SWR_CONFIG_NO_PIXEL_SHADER 0
static swr_context* swrCreateContext(uint32_t w, uint32_t h); | |
static void swrDestroyContext(swr_context* ctx); | |
static void swrClear(swr_context* ctx, uint32_t color); | |
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color); | |
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color); | |
static void swrDrawTriangleDispatch(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleRef(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleRef_Hierarchical(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleRef_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleRef_HierarchicalLRB_NoCond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSE2_Ref(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangleSSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color); | |
// Global API table. Callers reach the renderer exclusively through these
// function pointers. drawTriangle initially points at the dispatcher, which
// overwrites the slot with a concrete rasterizer on its first call.
swr_api* swr = &(swr_api){
    .createContext = swrCreateContext,
    .destroyContext = swrDestroyContext,
    .clear = swrClear,
    .drawPixel = swrDrawPixel,
    .drawLine = swrDrawLine,
    .drawTriangle = swrDrawTriangleDispatch,
    .drawText = swrDrawText
};
static swr_context* swrCreateContext(uint32_t w, uint32_t h) | |
{ | |
swr_context* ctx = (swr_context*)malloc(sizeof(swr_context)); | |
if (!ctx) { | |
return NULL; | |
} | |
memset(ctx, 0, sizeof(swr_context)); | |
ctx->m_FrameBuffer = (uint32_t*)malloc(sizeof(uint32_t) * (size_t)w * (size_t)h); | |
if (!ctx->m_FrameBuffer) { | |
swrDestroyContext(ctx); | |
return NULL; | |
} | |
memset(ctx->m_FrameBuffer, 0, sizeof(uint32_t) * (size_t)w * (size_t)h); | |
ctx->m_Width = w; | |
ctx->m_Height = h; | |
return ctx; | |
} | |
// Releases a context created by swrCreateContext() together with its
// framebuffer. Passing NULL is a no-op, mirroring free().
static void swrDestroyContext(swr_context* ctx)
{
    if (!ctx) {
        return;
    }

    free(ctx->m_FrameBuffer);
    free(ctx);
}
// Fills the whole framebuffer with `color`.
static void swrClear(swr_context* ctx, uint32_t color)
{
    // Compute the pixel count in size_t, matching how the framebuffer is
    // sized at allocation time, so the product cannot wrap at 32 bits.
    const size_t numPixels = (size_t)ctx->m_Width * (size_t)ctx->m_Height;

    uint32_t* pixel = ctx->m_FrameBuffer;
    uint32_t* const end = pixel + numPixels;
    while (pixel != end) {
        *pixel++ = color;
    }
}
// Writes `color` at (x, y). Coordinates outside the framebuffer are ignored.
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color)
{
    if (x < 0 || y < 0 || x >= (int32_t)ctx->m_Width || y >= (int32_t)ctx->m_Height) {
        return;
    }

    // x and y are known to be non-negative here; index in size_t so the
    // row offset cannot wrap at 32 bits.
    ctx->m_FrameBuffer[(size_t)y * (size_t)ctx->m_Width + (size_t)x] = color;
}
// Draws a line from (x0, y0) to (x1, y1), both endpoints included, using an
// integer error-accumulating (Bresenham-style) walk along the major axis.
// Every pixel goes through swrDrawPixel(), so off-screen parts are ignored.
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color)
{
    // A "steep" line changes faster in y than in x. Transpose it so the loop
    // below can always step one unit along x.
    const bool steep = swr_absi(x0 - x1) < swr_absi(y0 - y1);
    if (steep) {
        int32_t tmp;
        tmp = x0; x0 = y0; y0 = tmp;
        tmp = x1; x1 = y1; y1 = tmp;
    }

    // Always walk left to right.
    if (x0 > x1) {
        int32_t tmp;
        tmp = x0; x0 = x1; x1 = tmp;
        tmp = y0; y0 = y1; y1 = tmp;
    }

    const int32_t dx = x1 - x0;
    const int32_t derror2 = swr_absi(y1 - y0) * 2;
    const int32_t yinc = (y1 > y0) ? 1 : -1;

    int32_t error2 = 0;
    int32_t y = y0;
    for (int32_t x = x0; x <= x1; ++x) {
        // Undo the transpose when emitting the pixel.
        if (steep) {
            swrDrawPixel(ctx, y, x, color);
        } else {
            swrDrawPixel(ctx, x, y, color);
        }

        error2 += derror2;
        if (error2 > dx) {
            y += yinc;
            error2 -= dx * 2;
        }
    }
}
// Draws the characters in [str, end) with a 1-bit-per-pixel bitmap font,
// starting with the top-left corner of the first glyph at (x0, y0) and
// advancing m_CharWidth pixels per character. Only set bits are drawn.
//
// `end` may be NULL, in which case `str` must be NUL-terminated and is drawn
// up to its terminator. A NULL `font` or `str` draws nothing.
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color)
{
    if (!font || !str) {
        return;
    }

    if (end == NULL) {
        end = str + strlen(str);
    }

    const int32_t chw = (int32_t)font->m_CharWidth;
    const int32_t chh = (int32_t)font->m_CharHeight;
    const uint8_t* chdata = font->m_CharData;

    // Each glyph row is read as a single byte; bit `chx` selects column chx.
    // NOTE(review): this presumably requires m_CharWidth <= 8 - confirm
    // against the font definition.
    int32_t x = x0;
    for (; str < end; ++str, x += chw) {
        // Characters outside the font's range are replaced by the fallback.
        char ch = *str;
        if (ch < font->m_CharMin || ch > font->m_CharMax) {
            ch = font->m_MissingCharFallbackID;
        }

        // Glyphs are stored back to back, chh bytes each, starting at m_CharMin.
        const uint8_t chID = (uint8_t)ch - font->m_CharMin;
        const uint8_t* charData = &chdata[chID * chh];

        for (int32_t chy = 0; chy < chh; ++chy) {
            const uint8_t chrow = charData[chy];
            for (int32_t chx = 0; chx < chw; ++chx) {
                if ((chrow & (1u << chx)) != 0) {
                    swrDrawPixel(ctx, x + chx, y0 + chy, color);
                }
            }
        }
    }
}
// Initial value of swr->drawTriangle. On its first call it replaces the
// function pointer with a concrete rasterizer and forwards the call, so later
// calls go straight to the selected implementation.
//
// The implementation is currently selected at compile time by editing the
// #if chain below; the first branch whose condition is 1 wins.
// NOTE(review): no CPU feature detection is performed, so the SSE4.1 variant
// is installed unconditionally.
static void swrDrawTriangleDispatch(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // TODO: Check CPU caps
#if 1
    swr->drawTriangle = swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2;
#elif 1
    swr->drawTriangle = swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2;
#elif 1
    swr->drawTriangle = swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2;
#else
    swr->drawTriangle = swrDrawTriangleRef_HierarchicalLRB_NoCond;
#endif

    swr->drawTriangle(ctx, x0, y0, x1, y1, x2, y2, color0, color1, color2);
}
////////////////////////////////////////////////////////////////////////// | |
// swrDrawTriangle() implementations | |
// | |
// Edge function of the directed line (x0, y0) -> (x1, y1):
//
//   E(x, y) = (x - m_x0) * m_dx + (y - m_y0) * m_dy
//
// E is zero on the line. m_dx is the change in E for a one pixel step in +x,
// m_dy the change for a one pixel step in +y.
typedef struct swr_edge
{
    int32_t m_x0; // x of the edge's first endpoint
    int32_t m_y0; // y of the edge's first endpoint
    int32_t m_dx; // step of E per +1 in x (y1 - y0)
    int32_t m_dy; // step of E per +1 in y (x0 - x1)
} swr_edge;

// Builds the edge function for the line from (x0, y0) to (x1, y1).
static inline swr_edge swr_edgeInit(int32_t x0, int32_t y0, int32_t x1, int32_t y1)
{
    return (swr_edge){
        .m_x0 = x0,
        .m_y0 = y0,
        .m_dx = (y1 - y0),
        .m_dy = (x0 - x1),
    };
}

// Evaluates the edge function at (x, y).
static inline int32_t swr_edgeEval(swr_edge edge, int32_t x, int32_t y)
{
    return (x - edge.m_x0) * edge.m_dx
         + (y - edge.m_y0) * edge.m_dy;
}
// Narrows the column span [*pxmin, *pxmax] (offsets from the bounding box's
// left edge) to the offsets i for which the edge value w_row + i * w_dx is
// non-negative. w_row is the edge value at offset 0, w_dx its step per column.
//
//   1. w_row >= 0 && w_dx >= 0 : every i >= 0 qualifies, span unchanged
//   2. w_row >= 0 && w_dx <  0 : i <= floor(w_row / -w_dx)
//   3. w_row <  0 && w_dx >  0 : i >= ceil(-w_row / w_dx)
//   4. w_row <  0 && w_dx <= 0 : no i >= 0 qualifies
//
// Returns false in case 4 (the row has no covered pixel at or right of the
// bounding box's left edge), true otherwise.
static inline bool swr_refClipRowSpan(int32_t w_row, int32_t w_dx, int32_t* pxmin, int32_t* pxmax)
{
    if (w_row >= 0) {
        if (w_dx < 0) {
            // w_row / w_dx truncates toward zero and is <= 0 here, so the
            // negation equals floor(w_row / |w_dx|).
            *pxmax = swr_mini(*pxmax, -(w_row / w_dx));
        }
        return true;
    }

    if (w_dx <= 0) {
        return false;
    }

    // Ceiling division of two positive values.
    *pxmin = swr_maxi(*pxmin, (-w_row / w_dx) + ((-w_row % w_dx) != 0 ? 1 : 0));
    return true;
}

// Reference implementation
// https://fgiesen.wordpress.com/2013/02/08/triangle-rasterization-in-practice/
// NOTE: No fill rule used. All pixels lying ON an edge are drawn.
//
// Rasterizes the triangle (x0, y0), (x1, y1), (x2, y2), clipped to the
// framebuffer, interpolating the three vertex colors with barycentric weights.
// For every row the covered column span is computed analytically from the
// three edge functions, so the inner loop needs no per-pixel inside test.
// Zero-area triangles and triangles entirely off-screen draw nothing.
static void swrDrawTriangleRef(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute triangle bounding box, clipped to the framebuffer.
    const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    if (minX > maxX || minY > maxY) {
        // The triangle lies completely outside the framebuffer.
        return;
    }
    const int32_t bboxWidth = maxX - minX;
    const int32_t bboxHeight = maxY - minY;

    // Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
    const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
    const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
    const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
    const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
    const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
    const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
    const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

    // Triangle setup: edge i is the edge opposite vertex i.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const float inv_area = 1.0f / (float)iarea;
#endif

    // Rasterize. The *_row values track the edge functions at the left edge
    // of the bounding box for the current row.
    int32_t w0_row = swr_edgeEval(edge0, minX, minY);
    int32_t w1_row = swr_edgeEval(edge1, minX, minY);
    int32_t w2_row = swr_edgeEval(edge2, minX, minY);
    uint32_t* fb_row = &ctx->m_FrameBuffer[minX + minY * ctx->m_Width];
    for (int32_t py = 0; py <= bboxHeight; ++py) {
        // Intersect the spans allowed by each of the 3 edges. Because the
        // bounding box is clipped to the framebuffer, a row can be entirely
        // outside the triangle (its covered part lies left of the box); such
        // rows are skipped instead of being filled.
        int32_t pxmin = 0;
        int32_t pxmax = bboxWidth;
        const bool rowCovered = swr_refClipRowSpan(w0_row, edge0.m_dx, &pxmin, &pxmax)
            && swr_refClipRowSpan(w1_row, edge1.m_dx, &pxmin, &pxmax)
            && swr_refClipRowSpan(w2_row, edge2.m_dx, &pxmin, &pxmax);

        if (rowCovered) {
            // Calculate barycentric coords at pxmin
            int32_t w0 = w0_row + pxmin * edge0.m_dx;
            int32_t w1 = w1_row + pxmin * edge1.m_dx;
            int32_t w2 = w2_row + pxmin * edge2.m_dx;

            for (int32_t px = pxmin; px <= pxmax; ++px) {
                // (px, py) is guaranteed to be inside the triangle (or on one of the edges)
                assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
                const uint32_t rgba = 0xFFFFFFFF;
#else
                const float l0 = (float)w0 * inv_area;
                const float l1 = (float)w1 * inv_area;
                // l2 = 1.0f - (l0 + l1)
                //
                // attr = attr0 * l0 + attr1 * l1 + attr2 * l2                    <=>
                // attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1)      <=>
                // attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2    <=>
                // attr = dattr02 * l0 + dattr12 * l1 + attr2
                const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
                const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
                const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
                const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
                const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
                fb_row[px] = rgba;

                w0 += edge0.m_dx;
                w1 += edge1.m_dx;
                w2 += edge2.m_dx;
            }
        }

        w0_row += edge0.m_dy;
        w1_row += edge1.m_dy;
        w2_row += edge2.m_dy;
        fb_row += ctx->m_Width;
    }
}
// Edge length, in pixels, of the square blocks visited by the hierarchical
// rasterizers.
static const uint32_t kBlockSize = 4;

// Selects how corner signs are encoded in the row/block masks below:
//   1: a bit is set when the edge value at that corner is >= 0
//   0: a bit is set when the edge value at that corner is <  0
#define SWR_CONFIG_USE_POSITIVE_MASKS 0

// Example 4x1 row:
// *---*---*---*---*
// | A |   |   | B |
// *---*---*---*---*
//
// A = (blockMinX, y)
// B = (blockMaxX, y)
//
// ('+' means the edge value is >= 0 at that pixel, '-' means it is < 0)
//
// Case | A | B | result
// -----|---|---|----------------------------------
//  00  | - | - | not covered
//  01  | - | + | partially covered, [xmin, blockMaxX]
//  10  | + | - | partially covered, [blockMinX, xmax]
//  11  | + | + | fully covered
#define SWR_ROW_MASK_A_Pos 1
#define SWR_ROW_MASK_A_Msk (0x01 << SWR_ROW_MASK_A_Pos)
#define SWR_ROW_MASK_B_Pos 0
#define SWR_ROW_MASK_B_Msk (0x01 << SWR_ROW_MASK_B_Pos)

// SWR_ROW_MASK(wA, wB)        : mask for one edge from its values at A and B.
// SWR_ROW_MASK_EMPTY / _FULL  : masks of a row that is entirely outside /
//                               entirely inside one edge.
// SWR_ROW_MASK_X_MIN(msk)     : A is outside, the span's left end must move right.
// SWR_ROW_MASK_X_MAX(msk)     : B is outside, the span's right end must move left.
// SWR_ROW_MASK_ANY_EMPTY(...) : at least one of the 3 edges rejects the row.
// SWR_ROW_MASK_ALL_FULL(...)  : all 3 edges accept both ends of the row.
#if SWR_CONFIG_USE_POSITIVE_MASKS
#define SWR_ROW_MASK(wA, wB) (0 \
    | ((wA) >= 0 ? SWR_ROW_MASK_A_Msk : 0x00) \
    | ((wB) >= 0 ? SWR_ROW_MASK_B_Msk : 0x00) \
    )
#define SWR_ROW_MASK_EMPTY SWR_ROW_MASK(-1, -1)
#define SWR_ROW_MASK_FULL SWR_ROW_MASK(1, 1)
#define SWR_ROW_MASK_X_MIN(msk) (((msk) & SWR_ROW_MASK_A_Msk) == 0)
#define SWR_ROW_MASK_X_MAX(msk) (((msk) & SWR_ROW_MASK_B_Msk) == 0)
#define SWR_ROW_MASK_ANY_EMPTY(msk0, msk1, msk2) ((((msk0) == SWR_ROW_MASK_EMPTY) || ((msk1) == SWR_ROW_MASK_EMPTY) || ((msk2) == SWR_ROW_MASK_EMPTY)))
#define SWR_ROW_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) & (msk1) & (msk2)) == SWR_ROW_MASK_FULL)
#else
#define SWR_ROW_MASK(wA, wB) (0 \
    | ((wA) < 0 ? SWR_ROW_MASK_A_Msk : 0x00) \
    | ((wB) < 0 ? SWR_ROW_MASK_B_Msk : 0x00) \
    )
#define SWR_ROW_MASK_EMPTY SWR_ROW_MASK(-1, -1)
#define SWR_ROW_MASK_FULL SWR_ROW_MASK(1, 1)
#define SWR_ROW_MASK_X_MIN(msk) (((msk) & SWR_ROW_MASK_A_Msk) == SWR_ROW_MASK_A_Msk)
#define SWR_ROW_MASK_X_MAX(msk) (((msk) & SWR_ROW_MASK_B_Msk) == SWR_ROW_MASK_B_Msk)
#define SWR_ROW_MASK_ANY_EMPTY(msk0, msk1, msk2) ((((msk0) == SWR_ROW_MASK_EMPTY) || ((msk1) == SWR_ROW_MASK_EMPTY) || ((msk2) == SWR_ROW_MASK_EMPTY)))
#define SWR_ROW_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) | (msk1) | (msk2)) == SWR_ROW_MASK_FULL)
#endif
// Example 4x4 block:
//
// *---*---*---*---*
// | A |   |   | B |
// *---*---*---*---*
// |   |   |   |   |
// *---*---*---*---*
// |   |   |   |   |
// *---*---*---*---*
// | D |   |   | C |
// *---*---*---*---*
//
// A = (blockMinX, blockMinY)
// B = (blockMaxX, blockMinY)
// C = (blockMaxX, blockMaxY)
// D = (blockMinX, blockMaxY)
//
// ('+' means the edge value is >= 0 at that corner, '-' means it is < 0)
//
// Case | A | B | C | D | result
// -----|---|---|---|---|----------------------------------
// 0000 | - | - | - | - | not covered
// 0001 | - | - | - | + | partially covered, [ymin, blockMaxY]
// 0010 | - | - | + | - | partially covered, [ymin, blockMaxY]
// 0011 | - | - | + | + | partially covered, [ymin, blockMaxY]
// 0100 | - | + | - | - | partially covered, [blockMinY, ymax]
// 0101 | - | + | - | + | invalid configuration
// 0110 | - | + | + | - | partially covered, [blockMinY, blockMaxY]
// 0111 | - | + | + | + | partially covered, [blockMinY, blockMaxY]
// 1000 | + | - | - | - | partially covered, [blockMinY, ymax]
// 1001 | + | - | - | + | partially covered, [blockMinY, blockMaxY]
// 1010 | + | - | + | - | invalid configuration
// 1011 | + | - | + | + | partially covered, [blockMinY, blockMaxY]
// 1100 | + | + | - | - | partially covered, [blockMinY, ymax]
// 1101 | + | + | - | + | partially covered, [blockMinY, blockMaxY]
// 1110 | + | + | + | - | partially covered, [blockMinY, blockMaxY]
// 1111 | + | + | + | + | fully covered
#define SWR_BLOCK_MASK_A_Pos 3
#define SWR_BLOCK_MASK_A_Msk (0x01 << SWR_BLOCK_MASK_A_Pos)
#define SWR_BLOCK_MASK_B_Pos 2
#define SWR_BLOCK_MASK_B_Msk (0x01 << SWR_BLOCK_MASK_B_Pos)
#define SWR_BLOCK_MASK_C_Pos 1
#define SWR_BLOCK_MASK_C_Msk (0x01 << SWR_BLOCK_MASK_C_Pos)
#define SWR_BLOCK_MASK_D_Pos 0
#define SWR_BLOCK_MASK_D_Msk (0x01 << SWR_BLOCK_MASK_D_Pos)

// SWR_BLOCK_MASK(wA, wB, wC, wD) : mask for one edge from its 4 corner values.
// SWR_BLOCK_MASK_EMPTY / _FULL   : masks of a block entirely outside /
//                                  entirely inside one edge.
// SWR_BLOCK_MASK_IS_VALID(msk)   : false for the two diagonal sign patterns,
//                                  which a linear function cannot produce.
// SWR_BLOCK_MASK_Y_MIN(msk)      : A and B are outside, the first row must move down.
// SWR_BLOCK_MASK_Y_MAX(msk)      : C and D are outside, the last row must move up.
// SWR_BLOCK_MASK_ALL_FULL(...)   : all 3 edges accept all 4 corners.
#if SWR_CONFIG_USE_POSITIVE_MASKS
#define SWR_BLOCK_MASK(wA, wB, wC, wD) (0 \
    | ((wA) >= 0 ? SWR_BLOCK_MASK_A_Msk : 0x00) \
    | ((wB) >= 0 ? SWR_BLOCK_MASK_B_Msk : 0x00) \
    | ((wC) >= 0 ? SWR_BLOCK_MASK_C_Msk : 0x00) \
    | ((wD) >= 0 ? SWR_BLOCK_MASK_D_Msk : 0x00) \
    )
#define SWR_BLOCK_MASK_EMPTY SWR_BLOCK_MASK(-1, -1, -1, -1)
#define SWR_BLOCK_MASK_FULL SWR_BLOCK_MASK(1, 1, 1, 1)
#define SWR_BLOCK_MASK_IS_VALID(msk) (((msk) != SWR_BLOCK_MASK(-1, 1, -1, 1)) && ((msk) != SWR_BLOCK_MASK(1, -1, 1, -1)))
#define SWR_BLOCK_MASK_Y_MIN(msk) (((msk) & (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk)) == 0)
#define SWR_BLOCK_MASK_Y_MAX(msk) (((msk) & (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk)) == 0)
#define SWR_BLOCK_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) & (msk1) & (msk2)) == SWR_BLOCK_MASK_FULL)
#else
#define SWR_BLOCK_MASK(wA, wB, wC, wD) (0 \
    | ((wA) < 0 ? SWR_BLOCK_MASK_A_Msk : 0x00) \
    | ((wB) < 0 ? SWR_BLOCK_MASK_B_Msk : 0x00) \
    | ((wC) < 0 ? SWR_BLOCK_MASK_C_Msk : 0x00) \
    | ((wD) < 0 ? SWR_BLOCK_MASK_D_Msk : 0x00) \
    )
#define SWR_BLOCK_MASK_EMPTY SWR_BLOCK_MASK(-1, -1, -1, -1)
#define SWR_BLOCK_MASK_FULL SWR_BLOCK_MASK(1, 1, 1, 1)
#define SWR_BLOCK_MASK_IS_VALID(msk) (((msk) != SWR_BLOCK_MASK(-1, 1, -1, 1)) && ((msk) != SWR_BLOCK_MASK(1, -1, 1, -1)))
#define SWR_BLOCK_MASK_Y_MIN(msk) (((msk) & (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk)) == (SWR_BLOCK_MASK_A_Msk | SWR_BLOCK_MASK_B_Msk))
#define SWR_BLOCK_MASK_Y_MAX(msk) (((msk) & (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk)) == (SWR_BLOCK_MASK_C_Msk | SWR_BLOCK_MASK_D_Msk))
#define SWR_BLOCK_MASK_ALL_FULL(msk0, msk1, msk2) (((msk0) | (msk1) | (msk2)) == SWR_BLOCK_MASK_FULL)
#endif
// Block-based variant of the reference rasterizer.
//
// The clipped bounding box is walked in kBlockSize x kBlockSize blocks aligned
// to the block grid. For every block the 3 edge functions are evaluated at the
// 4 block corners:
//   - a block with all 4 corners outside any one edge is skipped,
//   - a block with all corners inside all edges (and lying completely inside
//     the clipped bounding box) is filled without any span computation,
//   - every other block first narrows its row range, then each row narrows
//     its column span, and only the resulting pixels are shaded.
//
// Rows/columns of a block that fall outside the clipped bounding box are never
// written, so blocks straddling the framebuffer border cannot write past the
// end of a row or of the framebuffer.
static void swrDrawTriangleRef_Hierarchical(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute triangle bounding box, clipped to the framebuffer (inclusive bounds).
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // The triangle lies completely outside the framebuffer.
        return;
    }
    const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
    const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

    // Offset of the last row/column inside a block.
    const int32_t blockLast = (int32_t)kBlockSize - 1;

    // Prepare interpolated attributes
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
    const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
    const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
    const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
    const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
    const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
    const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
    const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

    // Triangle setup: edge i is the edge opposite vertex i.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const float inv_area = 1.0f / (float)iarea;
#endif

    // Rasterize. bboxMaxX/bboxMaxY are inclusive, so the block that starts
    // exactly on them must still be visited (hence "<=").
    for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += (int32_t)kBlockSize) {
        const int32_t blockMaxY = blockMinY + blockLast;
        // Last row of this block that is still inside the clipped bounding box.
        const int32_t lastRow = swr_mini(blockLast, bboxMaxY - blockMinY);

        for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += (int32_t)kBlockSize) {
            const int32_t blockMaxX = blockMinX + blockLast;
            // Last column of this block that is still inside the clipped bounding box.
            const int32_t lastCol = swr_mini(blockLast, bboxMaxX - blockMinX);

            // Evaluate 1st edge function at the 4 block corners. If all of the signed
            // distances are negative (all sign bits are 1) then the block will be empty.
            const int32_t w0_A = swr_edgeEval(edge0, blockMinX, blockMinY);
            const int32_t w0_B = swr_edgeEval(edge0, blockMaxX, blockMinY);
            const int32_t w0_C = swr_edgeEval(edge0, blockMaxX, blockMaxY);
            const int32_t w0_D = swr_edgeEval(edge0, blockMinX, blockMaxY);
            const uint32_t w0_blockMsk = SWR_BLOCK_MASK(w0_A, w0_B, w0_C, w0_D);
            assert(SWR_BLOCK_MASK_IS_VALID(w0_blockMsk));
            if (w0_blockMsk == SWR_BLOCK_MASK_EMPTY) {
                continue;
            }

            // Evaluate 2nd edge function at the 4 block corners.
            const int32_t w1_A = swr_edgeEval(edge1, blockMinX, blockMinY);
            const int32_t w1_B = swr_edgeEval(edge1, blockMaxX, blockMinY);
            const int32_t w1_C = swr_edgeEval(edge1, blockMaxX, blockMaxY);
            const int32_t w1_D = swr_edgeEval(edge1, blockMinX, blockMaxY);
            const uint32_t w1_blockMsk = SWR_BLOCK_MASK(w1_A, w1_B, w1_C, w1_D);
            assert(SWR_BLOCK_MASK_IS_VALID(w1_blockMsk));
            if (w1_blockMsk == SWR_BLOCK_MASK_EMPTY) {
                continue;
            }

            // Evaluate 3rd edge function at the 4 block corners.
            const int32_t w2_A = swr_edgeEval(edge2, blockMinX, blockMinY);
            const int32_t w2_B = swr_edgeEval(edge2, blockMaxX, blockMinY);
            const int32_t w2_C = swr_edgeEval(edge2, blockMaxX, blockMaxY);
            const int32_t w2_D = swr_edgeEval(edge2, blockMinX, blockMaxY);
            const uint32_t w2_blockMsk = SWR_BLOCK_MASK(w2_A, w2_B, w2_C, w2_D);
            assert(SWR_BLOCK_MASK_IS_VALID(w2_blockMsk));
            if (w2_blockMsk == SWR_BLOCK_MASK_EMPTY) {
                continue;
            }

            // The fast path writes all kBlockSize x kBlockSize pixels, so it is only
            // allowed when the whole block is inside the clipped bounding box.
            const bool fullBlock = SWR_BLOCK_MASK_ALL_FULL(w0_blockMsk, w1_blockMsk, w2_blockMsk)
                && lastRow == blockLast
                && lastCol == blockLast;

            if (!fullBlock) {
                // Partial block: narrow the row range [pymin, pymax] using each edge
                // whose top or bottom corner pair is outside.
                int32_t pymin = 0;
                int32_t pymax = lastRow;
                {
                    if (SWR_BLOCK_MASK_Y_MAX(w0_blockMsk)) {
                        const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
                        pymax = swr_mini(pymax, w_pymax);
                    } else if (SWR_BLOCK_MASK_Y_MIN(w0_blockMsk)) {
                        const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy));
                        pymin = swr_maxi(pymin, w_pymin);
                    }

                    if (SWR_BLOCK_MASK_Y_MAX(w1_blockMsk)) {
                        const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
                        pymax = swr_mini(pymax, w_pymax);
                    } else if (SWR_BLOCK_MASK_Y_MIN(w1_blockMsk)) {
                        const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy));
                        pymin = swr_maxi(pymin, w_pymin);
                    }

                    if (SWR_BLOCK_MASK_Y_MAX(w2_blockMsk)) {
                        const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
                        pymax = swr_mini(pymax, w_pymax);
                    } else if (SWR_BLOCK_MASK_Y_MIN(w2_blockMsk)) {
                        const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy));
                        pymin = swr_maxi(pymin, w_pymin);
                    }
                }

                // Evaluate edge functions at both ends of the first row.
                int32_t w0_blockMinX_py = swr_edgeEval(edge0, blockMinX, blockMinY + pymin);
                int32_t w1_blockMinX_py = swr_edgeEval(edge1, blockMinX, blockMinY + pymin);
                int32_t w2_blockMinX_py = swr_edgeEval(edge2, blockMinX, blockMinY + pymin);
                int32_t w0_blockMaxX_py = w0_blockMinX_py + edge0.m_dx * blockLast;
                int32_t w1_blockMaxX_py = w1_blockMinX_py + edge1.m_dx * blockLast;
                int32_t w2_blockMaxX_py = w2_blockMinX_py + edge2.m_dx * blockLast;

                for (int32_t py = pymin; py <= pymax; ++py) {
                    const int32_t w0_rowMsk = SWR_ROW_MASK(w0_blockMinX_py, w0_blockMaxX_py);
                    const int32_t w1_rowMsk = SWR_ROW_MASK(w1_blockMinX_py, w1_blockMaxX_py);
                    const int32_t w2_rowMsk = SWR_ROW_MASK(w2_blockMinX_py, w2_blockMaxX_py);
                    assert(w0_rowMsk != SWR_ROW_MASK_EMPTY);
                    assert(w1_rowMsk != SWR_ROW_MASK_EMPTY);
                    assert(w2_rowMsk != SWR_ROW_MASK_EMPTY);

                    uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + (blockMinY + py) * ctx->m_Width];

                    // Narrow the column span [pxmin, pxmax] for this row.
                    int32_t pxmin = 0;
                    int32_t pxmax = lastCol;
                    if (!SWR_ROW_MASK_ALL_FULL(w0_rowMsk, w1_rowMsk, w2_rowMsk)) {
                        if (SWR_ROW_MASK_X_MAX(w0_rowMsk)) {
                            const int32_t w_pxmax = swr_idiv_floor(w0_blockMinX_py, -edge0.m_dx);
                            pxmax = swr_mini(pxmax, w_pxmax);
                        } else if (SWR_ROW_MASK_X_MIN(w0_rowMsk)) {
                            const int32_t w_pxmin = swr_idiv_ceil(-w0_blockMinX_py, edge0.m_dx);
                            pxmin = swr_maxi(pxmin, w_pxmin);
                        }

                        if (SWR_ROW_MASK_X_MAX(w1_rowMsk)) {
                            const int32_t w_pxmax = swr_idiv_floor(w1_blockMinX_py, -edge1.m_dx);
                            pxmax = swr_mini(pxmax, w_pxmax);
                        } else if (SWR_ROW_MASK_X_MIN(w1_rowMsk)) {
                            const int32_t w_pxmin = swr_idiv_ceil(-w1_blockMinX_py, edge1.m_dx);
                            pxmin = swr_maxi(pxmin, w_pxmin);
                        }

                        if (SWR_ROW_MASK_X_MAX(w2_rowMsk)) {
                            const int32_t w_pxmax = swr_idiv_floor(w2_blockMinX_py, -edge2.m_dx);
                            pxmax = swr_mini(pxmax, w_pxmax);
                        } else if (SWR_ROW_MASK_X_MIN(w2_rowMsk)) {
                            const int32_t w_pxmin = swr_idiv_ceil(-w2_blockMinX_py, edge2.m_dx);
                            pxmin = swr_maxi(pxmin, w_pxmin);
                        }
                    }

                    // Calculate barycentric coords at pxmin
                    int32_t w0 = w0_blockMinX_py + pxmin * edge0.m_dx;
                    int32_t w1 = w1_blockMinX_py + pxmin * edge1.m_dx;
                    int32_t w2 = w2_blockMinX_py + pxmin * edge2.m_dx;

                    for (int32_t px = pxmin; px <= pxmax; ++px) {
                        // (px, py) is guaranteed to be inside the triangle (or on one of the edges)
                        assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const uint32_t rgba = 0xFFFFFFFF;
#else
                        const float l0 = (float)w0 * inv_area;
                        const float l1 = (float)w1 * inv_area;
                        // l2 = 1.0f - (l0 + l1), hence
                        // attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2
                        const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
                        const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
                        const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
                        const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
                        const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
                        fb_row[px] = rgba;

                        w0 += edge0.m_dx;
                        w1 += edge1.m_dx;
                        w2 += edge2.m_dx;
                    }

                    w0_blockMinX_py += edge0.m_dy;
                    w1_blockMinX_py += edge1.m_dy;
                    w2_blockMinX_py += edge2.m_dy;
                    w0_blockMaxX_py += edge0.m_dy;
                    w1_blockMaxX_py += edge1.m_dy;
                    w2_blockMaxX_py += edge2.m_dy;
                }
            } else {
                // Full block: every pixel is covered, start from corner A.
                int32_t w0_row = w0_A;
                int32_t w1_row = w1_A;
                int32_t w2_row = w2_A;
                uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
                for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
                    int32_t w0 = w0_row;
                    int32_t w1 = w1_row;
                    int32_t w2 = w2_row;

                    for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
                        // (px, py) is guaranteed to be inside the triangle (or on one of the edges)
                        assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const uint32_t rgba = 0xFFFFFFFF;
#else
                        const float l0 = (float)w0 * inv_area;
                        const float l1 = (float)w1 * inv_area;
                        // l2 = 1.0f - (l0 + l1), hence
                        // attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2
                        const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
                        const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
                        const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
                        const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
                        const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
                        fb_row[px] = rgba;

                        w0 += edge0.m_dx;
                        w1 += edge1.m_dx;
                        w2 += edge2.m_dx;
                    }

                    w0_row += edge0.m_dy;
                    w1_row += edge1.m_dy;
                    w2_row += edge2.m_dy;
                    fb_row += ctx->m_Width;
                }
            }
        }
    }
}
// Evaluates to non-zero if at least one of the 3 signed integers is negative.
//
// OR-ing the values together carries any set sign bit into the result (two's
// complement), so a single comparison against 0 replaces three. The arguments
// are expected to be signed integers (e.g. int32_t); with unsigned operands the
// comparison can never be true. Each argument is evaluated exactly once.
#define SWR_ANY_NEGATIVE3(a, b, c) (((a) | (b) | (c)) < 0)
// 2-level hierarchical rasterization using trivial reject/accept corners.
//
// The framebuffer-clamped bounding box of the triangle is walked in blocks of
// kBlockSize x kBlockSize pixels:
//  - Blocks whose trivial reject corner lies outside any edge are skipped.
//  - Fully covered blocks are rasterized without any conditionals in the inner loops.
//  - Partially covered blocks are rasterized conditionally by keeping track of the
//    edge function values at each block row's min. Only completely uncovered rows
//    are skipped.
//
// ctx           Render target. Pixels are written to ctx->m_FrameBuffer using
//               ctx->m_Width as the row pitch.
// x0..x2/y0..y2 Integer pixel coordinates of the 3 vertices. Any winding is accepted;
//               clockwise triangles are flipped by swapping vertices 1 and 2.
// color0..2     Per-vertex colors. They are interpolated across the triangle unless
//               SWR_CONFIG_NO_PIXEL_SHADER is set, in which case every covered pixel
//               is written as 0xFFFFFFFF.
//
// Pixels lying exactly on an edge (edge function == 0) are considered covered.
//
// NOTE(review): blocks are always processed whole, so this presumably relies on the
// framebuffer dimensions being multiples of kBlockSize - confirm against the
// context creation code.
static void swrDrawTriangleRef_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute the triangle bounding box clamped to the framebuffer.
	// Both the min and the max corner are inclusive pixel coordinates.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
		// The triangle lies completely outside the framebuffer.
		return;
	}

	// The block walk starts at the block containing the bounding box min corner.
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
	//
	// With l0, l1, l2 the normalized barycentric coordinates (l2 = 1.0f - (l0 + l1)):
	//
	// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
	// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
	// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
	// attr = dattr02 * l0 + dattr12 * l1 + attr2
	//
	// so only the deltas against vertex 2 and the vertex 2 values are needed per pixel.
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
	const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
	const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
	const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
	const int32_t cr02 = (int32_t)c0r - (int32_t)c2r;
	const int32_t cg02 = (int32_t)c0g - (int32_t)c2g;
	const int32_t cb02 = (int32_t)c0b - (int32_t)c2b;
	const int32_t ca02 = (int32_t)c0a - (int32_t)c2a;
	const int32_t cr12 = (int32_t)c1r - (int32_t)c2r;
	const int32_t cg12 = (int32_t)c1g - (int32_t)c2g;
	const int32_t cb12 = (int32_t)c1b - (int32_t)c2b;
	const int32_t ca12 = (int32_t)c1a - (int32_t)c2a;
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const float inv_area = 1.0f / (float)iarea;
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
	const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
	const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
	const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
	const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
	const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);

	// Offset from the block's min corner to the corner where each edge function is largest.
	const int32_t trivialRejectOffset0 = 0
		+ (edge0.m_dx >= 0 ? w0_blockMax_dx : 0)
		+ (edge0.m_dy >= 0 ? w0_blockMax_dy : 0)
		;
	const int32_t trivialRejectOffset1 = 0
		+ (edge1.m_dx >= 0 ? w1_blockMax_dx : 0)
		+ (edge1.m_dy >= 0 ? w1_blockMax_dy : 0)
		;
	const int32_t trivialRejectOffset2 = 0
		+ (edge2.m_dx >= 0 ? w2_blockMax_dx : 0)
		+ (edge2.m_dy >= 0 ? w2_blockMax_dy : 0)
		;

	// The trivial accept corner is the corner opposite to the trivial reject corner.
	const int32_t trivialAcceptOffset0 = (w0_blockMax_dx + w0_blockMax_dy) - trivialRejectOffset0;
	const int32_t trivialAcceptOffset1 = (w1_blockMax_dx + w1_blockMax_dy) - trivialRejectOffset1;
	const int32_t trivialAcceptOffset2 = (w2_blockMax_dx + w2_blockMax_dy) - trivialRejectOffset2;

	// Horizontal-only reject offsets, used to reject whole rows inside a partial block.
	const int32_t trivialRejectOffset0_dx = 0
		+ (edge0.m_dx >= 0 ? w0_blockMax_dx : 0)
		;
	const int32_t trivialRejectOffset1_dx = 0
		+ (edge1.m_dx >= 0 ? w1_blockMax_dx : 0)
		;
	const int32_t trivialRejectOffset2_dx = 0
		+ (edge2.m_dx >= 0 ? w2_blockMax_dx : 0)
		;

	// Rasterize
	const int32_t w0_bboxMin = swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w1_bboxMin = swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w2_bboxMin = swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned);
	const int32_t w0_nextBlock_dx = edge0.m_dx * kBlockSize;
	const int32_t w0_nextBlock_dy = edge0.m_dy * kBlockSize;
	const int32_t w1_nextBlock_dx = edge1.m_dx * kBlockSize;
	const int32_t w1_nextBlock_dy = edge1.m_dy * kBlockSize;
	const int32_t w2_nextBlock_dx = edge2.m_dx * kBlockSize;
	const int32_t w2_nextBlock_dy = edge2.m_dy * kBlockSize;

	int32_t w0_blockY = w0_bboxMin;
	int32_t w1_blockY = w1_bboxMin;
	int32_t w2_blockY = w2_bboxMin;

	// bboxMaxX/bboxMaxY are inclusive, so a block starting exactly on the max
	// coordinate still contains bounding box pixels and must be visited (<=).
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
		int32_t w0_blockMin = w0_blockY;
		int32_t w1_blockMin = w1_blockY;
		int32_t w2_blockMin = w2_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const int32_t w0_trivialReject = w0_blockMin + trivialRejectOffset0;
			const int32_t w1_trivialReject = w1_blockMin + trivialRejectOffset1;
			const int32_t w2_trivialReject = w2_blockMin + trivialRejectOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialReject, w1_trivialReject, w2_trivialReject)) {
				w0_blockMin += w0_nextBlock_dx;
				w1_blockMin += w1_nextBlock_dx;
				w2_blockMin += w2_nextBlock_dx;
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//   trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//   trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const int32_t w0_trivialAccept = w0_blockMin + trivialAcceptOffset0;
			const int32_t w1_trivialAccept = w1_blockMin + trivialAcceptOffset1;
			const int32_t w2_trivialAccept = w2_blockMin + trivialAcceptOffset2;
			if (SWR_ANY_NEGATIVE3(w0_trivialAccept, w1_trivialAccept, w2_trivialAccept)) {
				// Partial block
				int32_t w0_row = w0_blockMin;
				int32_t w1_row = w1_blockMin;
				int32_t w2_row = w2_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Skip the row if its most positive end is already outside one of the edges.
					const int32_t w0_rowTrivialReject = w0_row + trivialRejectOffset0_dx;
					const int32_t w1_rowTrivialReject = w1_row + trivialRejectOffset1_dx;
					const int32_t w2_rowTrivialReject = w2_row + trivialRejectOffset2_dx;
					if (!SWR_ANY_NEGATIVE3(w0_rowTrivialReject, w1_rowTrivialReject, w2_rowTrivialReject)) {
						int32_t w0 = w0_row;
						int32_t w1 = w1_row;
						int32_t w2 = w2_row;
						for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
							// Only pixels inside (or on the edges of) the triangle are written.
							if (!SWR_ANY_NEGATIVE3(w0, w1, w2)) {
								assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
								const uint32_t rgba = 0xFFFFFFFF;
#else
								const float l0 = (float)w0 * inv_area;
								const float l1 = (float)w1 * inv_area;
								// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
								const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
								const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
								const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
								const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
								const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
								fb_row[px] = rgba;
							}

							w0 += edge0.m_dx;
							w1 += edge1.m_dx;
							w2 += edge2.m_dx;
						}
					}

					w0_row += edge0.m_dy;
					w1_row += edge1.m_dy;
					w2_row += edge2.m_dy;
					fb_row += ctx->m_Width;
				}
			} else {
				// Full block
				int32_t w0_row = w0_blockMin;
				int32_t w1_row = w1_blockMin;
				int32_t w2_row = w2_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Edge function values at the first pixel of the row
					int32_t w0 = w0_row;
					int32_t w1 = w1_row;
					int32_t w2 = w2_row;
					for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) {
						// (px, py) is guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixel
						{
							assert(w0 >= 0 && w1 >= 0 && w2 >= 0);
#if SWR_CONFIG_NO_PIXEL_SHADER
							const uint32_t rgba = 0xFFFFFFFF;
#else
							const float l0 = (float)w0 * inv_area;
							const float l1 = (float)w1 * inv_area;
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r);
							const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g);
							const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b);
							const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a);
							const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca);
#endif
							fb_row[px] = rgba;
						}

						w0 += edge0.m_dx;
						w1 += edge1.m_dx;
						w2 += edge2.m_dx;
					}

					w0_row += edge0.m_dy;
					w1_row += edge1.m_dy;
					w2_row += edge2.m_dy;
					fb_row += ctx->m_Width;
				}
			}

			w0_blockMin += w0_nextBlock_dx;
			w1_blockMin += w1_nextBlock_dx;
			w2_blockMin += w2_nextBlock_dx;
		}

		w0_blockY += w0_nextBlock_dy;
		w1_blockY += w1_nextBlock_dy;
		w2_blockY += w2_nextBlock_dy;
	}
}
// SSE2 flavour of the 2-level hierarchical rasterizer using trivial reject/accept corners.
//
// The 3 edge function values of a block/row are kept in the x/y/z lanes of a single
// vec4i (the w lane is always 0 so it never trips the "any negative" tests), and
// pixels are shaded 4 at a time along x.
//
//  - Blocks whose trivial reject corner lies outside any edge are skipped.
//  - Fully covered blocks are filled without per-pixel coverage tests.
//  - Partially covered blocks skip completely uncovered rows and use a per-pixel
//    mask for the remaining ones, so uncovered pixels keep their framebuffer value.
//
// ctx           Render target. Pixels are written to ctx->m_FrameBuffer using
//               ctx->m_Width as the row pitch.
// x0..x2/y0..y2 Integer pixel coordinates of the 3 vertices. Any winding is accepted;
//               clockwise triangles are flipped by swapping vertices 1 and 2.
// color0..2     Per-vertex colors. They are interpolated across the triangle unless
//               SWR_CONFIG_NO_PIXEL_SHADER is set, in which case every covered pixel
//               is written with all bits set.
//
// NOTE(review): blocks are always processed whole and 4 pixels per store, so this
// presumably relies on kBlockSize being a multiple of 4 and on the framebuffer
// dimensions being multiples of kBlockSize - confirm against the context creation code.
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute the triangle bounding box clamped to the framebuffer.
	// Both the min and the max corner are inclusive pixel coordinates.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
		// The triangle lies completely outside the framebuffer.
		return;
	}

	// The block walk starts at the block containing the bounding box min corner.
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize);

	// Prepare interpolated attributes
	//
	// With l0, l1, l2 the normalized barycentric coordinates (l2 = 1.0f - (l0 + l1)):
	//
	// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=>
	// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=>
	// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=>
	// attr = dattr02 * l0 + dattr12 * l1 + attr2
	//
	// Each channel's vertex 2 value and deltas are splatted across all 4 lanes so
	// 4 pixels can be interpolated at once.
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);
#endif

	// Triangle setup
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Barycentric coordinate normalization
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Trivial reject/accept corner offsets relative to block min/max.
	const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1);
	const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1);
	const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1);
	const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1);
	const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1);
	const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1);

	// Per-edge offset from the block's min corner to the corner where the edge function is largest.
	const vec4i v_trivialRejectOffset = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0) + (edge0.m_dy >= 0 ? w0_blockMax_dy : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0) + (edge1.m_dy >= 0 ? w1_blockMax_dy : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0) + (edge2.m_dy >= 0 ? w2_blockMax_dy : 0),
		0
	);

	// The trivial accept corner is the corner opposite to the trivial reject corner.
	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_fromInt4(
		(w0_blockMax_dx + w0_blockMax_dy),
		(w1_blockMax_dx + w1_blockMax_dy),
		(w2_blockMax_dx + w2_blockMax_dy),
		0), v_trivialRejectOffset
	);

	// Horizontal-only reject offsets, used to reject whole rows inside a partial block.
	const vec4i v_trivialRejectOffset_dx = vec4i_fromInt4(
		(edge0.m_dx >= 0 ? w0_blockMax_dx : 0),
		(edge1.m_dx >= 0 ? w1_blockMax_dx : 0),
		(edge2.m_dx >= 0 ? w2_blockMax_dx : 0),
		0
	);

	// Per-edge x steps for the 4 pixels of a group, and the step to the next group of 4.
	const vec4i v_edge0_dx0123 = vec4i_fromInt4(0, edge0.m_dx, edge0.m_dx * 2, edge0.m_dx * 3);
	const vec4i v_edge1_dx0123 = vec4i_fromInt4(0, edge1.m_dx, edge1.m_dx * 2, edge1.m_dx * 3);
	const vec4i v_edge2_dx0123 = vec4i_fromInt4(0, edge2.m_dx, edge2.m_dx * 2, edge2.m_dx * 3);
	const vec4i v_edge0_dx4 = vec4i_fromInt(edge0.m_dx * 4);
	const vec4i v_edge1_dx4 = vec4i_fromInt(edge1.m_dx * 4);
	const vec4i v_edge2_dx4 = vec4i_fromInt(edge2.m_dx * 4);
	const vec4i v_edge012__dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);

	// Rasterize
	const vec4i v_w_bboxMin = vec4i_fromInt4(
		swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned),
		swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned),
		0
	);
	const vec4i v_w_nextBlock_dx = vec4i_fromInt4(edge0.m_dx * kBlockSize, edge1.m_dx * kBlockSize, edge2.m_dx * kBlockSize, 0);
	const vec4i v_w_nextBlock_dy = vec4i_fromInt4(edge0.m_dy * kBlockSize, edge1.m_dy * kBlockSize, edge2.m_dy * kBlockSize, 0);

	vec4i v_w_blockY = v_w_bboxMin;

	// bboxMaxX/bboxMaxY are inclusive, so a block starting exactly on the max
	// coordinate still contains bounding box pixels and must be visited (<=).
	for (int32_t blockMinY = bboxMinY_aligned; blockMinY <= bboxMaxY; blockMinY += kBlockSize) {
		vec4i v_w_blockMin = v_w_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX <= bboxMaxX; blockMinX += kBlockSize) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialReject)) {
				v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
				continue;
			}

			// At this point we know that the triangle touches the tile. There are 2 cases:
			// - The tile is fully covered by the triangle.
			// - The tile is partially covered by the triangle.
			//
			// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path).
			// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row.
			//
			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// The trivial accept corner is the opposite corner to the trivial reject corner.
			// If all trivial accept corners are inside their respective edges then the block is fully
			// covered by the triangle (1st case). Otherwise it's partially covered (2nd case).
			//
			// The trivial accept corner is calculated by subtracting the trivial reject corner offset from
			// the block's max point.
			// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset
			// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite
			// (trivial accept) corner:
			//   trivialAcceptCornerX = blockMaxX - 0 = blockMaxX
			//   trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY
			//
			const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset);
			if (vec4i_any_neg_SSE2(v_w_trivialAccept)) {
				// Partial block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Skip the row if its most positive end is already outside one of the edges.
					const vec4i v_w_rowTrivialReject = vec4i_add(v_w_row, v_trivialRejectOffset_dx);
					if (!vec4i_any_neg_SSE2(v_w_rowTrivialReject)) {
						// Edge function values of the first 4 pixels of the row (one vector per edge).
						vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
						vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
						vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);
						for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
							// Calculate the (inverse) pixel mask.
							// If any of the barycentric coordinates is negative, the pixel mask will
							// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop
							// to blend between the existing framebuffer values and the new values.
							const vec4i v_izero = vec4i_zero();
							const vec4i v_w0_lt = vec4i_cmplt(v_w0, v_izero);
							const vec4i v_w1_lt = vec4i_cmplt(v_w1, v_izero);
							const vec4i v_w2_lt = vec4i_cmplt(v_w2, v_izero);
							const vec4i v_notPixelMask = vec4i_or(v_w0_lt, vec4i_or(v_w1_lt, v_w2_lt));

							{
#if SWR_CONFIG_NO_PIXEL_SHADER
								const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
								const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
								const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);
								// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
								const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
								const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
								const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
								const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

								// Pack independent R32/G32/B32/A32 values of the 4 pixels into RGBA8.
								const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
								// Store result using the pixel mask
								vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[px]);
							}

							v_w0 = vec4i_add(v_w0, v_edge0_dx4);
							v_w1 = vec4i_add(v_w1, v_edge1_dx4);
							v_w2 = vec4i_add(v_w2, v_edge2_dx4);
						}
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			} else {
				// Full block
				vec4i v_w_row = v_w_blockMin;
				uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width];
				for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) {
					// Edge function values of the first 4 pixels of the row (one vector per edge).
					vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123);
					vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123);
					vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123);
					for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) {
						// All 4 pixels are guaranteed to be inside the triangle (or on one of the edges)
						// Render the pixels
						{
#if SWR_CONFIG_NO_PIXEL_SHADER
							const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
							const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area);
							const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area);
							// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2));
							const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
							const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
							const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
							const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));

							// Pack independent R32/G32/B32/A32 values of the 4 pixels into RGBA8.
							const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
							vec4i_toInt4va(v_rgba8, &fb_row[px]);
						}

						v_w0 = vec4i_add(v_w0, v_edge0_dx4);
						v_w1 = vec4i_add(v_w1, v_edge1_dx4);
						v_w2 = vec4i_add(v_w2, v_edge2_dx4);
					}

					v_w_row = vec4i_add(v_w_row, v_edge012__dy);
					fb_row += ctx->m_Width;
				}
			}

			v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx);
		}

		v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy);
	}
}
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2) | |
{ | |
// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW. | |
int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2); | |
if (iarea == 0) { | |
// Degenerate triangle with 0 area. | |
return; | |
} else if (iarea < 0) { | |
// Swap (x1, y1) <-> (x2, y2) | |
{ int32_t tmp = x1; x1 = x2; x2 = tmp; } | |
{ int32_t tmp = y1; y1 = y2; y2 = tmp; } | |
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; } | |
iarea = -iarea; | |
} | |
// Compute triangle bounding box | |
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0); | |
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0); | |
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1)); | |
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1)); | |
const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 4); | |
const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4); | |
const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 4); | |
const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4); | |
const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned; | |
const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned; | |
// Prepare interpolated attributes | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4f v_c0 = vec4f_fromRGBA8(color0); | |
const vec4f v_c1 = vec4f_fromRGBA8(color1); | |
const vec4f v_c2 = vec4f_fromRGBA8(color2); | |
const vec4f v_c02 = vec4f_sub(v_c0, v_c2); | |
const vec4f v_c12 = vec4f_sub(v_c1, v_c2); | |
const vec4f v_r2 = vec4f_getXXXX(v_c2); | |
const vec4f v_g2 = vec4f_getYYYY(v_c2); | |
const vec4f v_b2 = vec4f_getZZZZ(v_c2); | |
const vec4f v_a2 = vec4f_getWWWW(v_c2); | |
const vec4f v_dr02 = vec4f_getXXXX(v_c02); | |
const vec4f v_dg02 = vec4f_getYYYY(v_c02); | |
const vec4f v_db02 = vec4f_getZZZZ(v_c02); | |
const vec4f v_da02 = vec4f_getWWWW(v_c02); | |
const vec4f v_dr12 = vec4f_getXXXX(v_c12); | |
const vec4f v_dg12 = vec4f_getYYYY(v_c12); | |
const vec4f v_db12 = vec4f_getZZZZ(v_c12); | |
const vec4f v_da12 = vec4f_getWWWW(v_c12); | |
// Barycentric coordinate normalization | |
const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea); | |
#endif | |
// Triangle setup | |
const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1); | |
const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2); | |
const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0); | |
// Trivial reject/accept corner offsets relative to block min/max. | |
const vec4i v_zero = vec4i_zero(); | |
const vec4i v_blockSize = vec4i_fromInt(4); | |
const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1); | |
const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3); | |
const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0); | |
const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0); | |
const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1); | |
const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1); | |
const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero); | |
const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero); | |
const vec4i v_trivialRejectOffset = vec4i_add( | |
vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx), | |
vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy) | |
); | |
const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset); | |
const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets); | |
const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets); | |
const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets); | |
const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy); | |
const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy); | |
const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy); | |
// Rasterize | |
const vec4i v_w_bboxMin = vec4i_fromInt4( | |
swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned), | |
swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned), | |
swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned), | |
0 | |
); | |
const vec4i v_w_nextBlock_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize); | |
const vec4i v_w_nextBlock_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize); | |
vec4i v_w_blockY = v_w_bboxMin; | |
for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY; blockMinY += 4) { | |
vec4i v_w_blockMin = v_w_blockY; | |
for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX; blockMinX += 4) { | |
// Evaluate each edge function at its trivial reject corner (the most positive block corner). | |
// If the trivial rejct corner of any edge is negative (outside the edge) then the triangle | |
// does not touch the block. | |
const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset); | |
if (vec4i_any_neg_SSE2(v_w_trivialReject)) { | |
v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx); | |
continue; | |
} | |
// At this point we know that the triangle touches the tile. There are 2 cases: | |
// - The tile is fully covered by the triangle. | |
// - The tile is partially covered by the triangle. | |
// | |
// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path). | |
// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row. | |
// | |
// Evaluate each edge function at its trivial accept corner (the most negative block corner). | |
// The trivial accept corner is the opposite corner to the trivial reject corner. | |
// If all trivial accept corners are inside their respective edges then the block is fully | |
// covered by the triangle (1st case). Otherwise it's partially covered (2nd case). | |
// | |
// The trivial accept corner is calculated by subtracting the trivial reject corner offset from | |
// the block's max point. | |
// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset | |
// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite | |
// (trivial accept) corner: | |
// trivialAcceptCornerX = blockMaxX - 0 = blockMaxX | |
// trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY | |
// | |
const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset); | |
if (vec4i_any_neg_SSE2(v_w_trivialAccept)) { | |
// Partial block | |
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width]; | |
vec4i v_w0_row0 = vec4i_add(vec4i_getXXXX(v_w_blockMin), v_edge0_dx0123); | |
vec4i v_w1_row0 = vec4i_add(vec4i_getYYYY(v_w_blockMin), v_edge1_dx0123); | |
vec4i v_w2_row0 = vec4i_add(vec4i_getZZZZ(v_w_blockMin), v_edge2_dx0123); | |
vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy); | |
vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy); | |
vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy); | |
vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy); | |
vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy); | |
vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy); | |
vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy); | |
vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy); | |
vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy); | |
// Calculate the (inverse) pixel mask. | |
// If any of the barycentric coordinates is negative, the pixel mask will | |
// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop | |
// to blend between the existing framebuffer values and the new values. | |
const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0); | |
const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1); | |
const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2); | |
const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3); | |
if (!vec4i_all_neg_SSE2(v_w_row0_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]); | |
} | |
if (!vec4i_all_neg_SSE2(v_w_row1_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]); | |
} | |
if (!vec4i_all_neg_SSE2(v_w_row2_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]); | |
} | |
if (!vec4i_all_neg_SSE2(v_w_row3_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]); | |
} | |
} else { | |
#if 1 | |
// Full block | |
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width]; | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_w0_row0 = vec4i_add(vec4i_getXXXX(v_w_blockMin), v_edge0_dx0123); | |
const vec4i v_w1_row0 = vec4i_add(vec4i_getYYYY(v_w_blockMin), v_edge1_dx0123); | |
const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy); | |
const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy); | |
const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy); | |
const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy); | |
const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy); | |
const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy); | |
#endif | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row0 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area); | |
const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area); | |
const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2))); | |
const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2))); | |
const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2))); | |
const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2))); | |
const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0); | |
#endif | |
vec4i_toInt4va(v_rgba8_row0, &fb_row[0]); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row1 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area); | |
const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area); | |
const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2))); | |
const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2))); | |
const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2))); | |
const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2))); | |
const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1); | |
#endif | |
vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row2 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area); | |
const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area); | |
const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2))); | |
const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2))); | |
const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2))); | |
const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2))); | |
const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2); | |
#endif | |
vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row3 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area); | |
const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area); | |
const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2))); | |
const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2))); | |
const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2))); | |
const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2))); | |
const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3); | |
#endif | |
vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]); | |
#endif | |
} | |
v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx); | |
} | |
v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy); | |
} | |
} | |
// Rasterizes a single triangle with per-vertex colors into ctx->m_FrameBuffer using
// hierarchical (block based) edge-function tests.
//
// The clamped bounding box is walked in strips of 16x4 pixels. Each strip is treated as four
// horizontally adjacent 4x4 blocks (x offsets 0, 4, 8, 12) and the three edge functions are
// evaluated for all four blocks at once, one block per SIMD lane. For every block:
//  - if the trivial reject corner of any edge is negative, the block is skipped,
//  - if the trivial accept corner of any edge is negative, the block is only partially covered
//    and each of its 4 rows is written through a per-pixel mask,
//  - otherwise the block is fully covered and its 4 rows are written without a mask.
//
// ctx            Render target. m_Width/m_Height are read, m_FrameBuffer is written.
// x0,y0..x2,y2   Integer vertex positions. Either winding is accepted; triangles with negative
//                area get vertices 1 and 2 (and their colors) swapped. Zero-area triangles
//                are discarded.
// color0..color2 Per-vertex colors, decoded with vec4f_fromRGBA8() and interpolated across the
//                triangle (unless SWR_CONFIG_NO_PIXEL_SHADER is set, in which case every
//                covered pixel is written as 0xFFFFFFFF).
//
// NOTE(review): blocks are never clipped against m_Width/m_Height; this looks like it assumes
// the framebuffer width is a multiple of 16 and the height a multiple of 4 - confirm.
// NOTE(review): bboxMaxX/bboxMaxY are inclusive but the block loops stop before
// swr_alignUp(bboxMax, N). If swr_alignUp() returns its argument unchanged when it is already
// aligned, the column/row at bboxMax is never visited - confirm against the fill rule
// implemented by swr_edgeInit().
static void swrDrawTriangleSSE2_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW.
	// iarea is the cross product of edges (v2 -> v0) and (v0 -> v1): zero when the points are
	// collinear, and its sign encodes the winding order.
	int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Compute the triangle bounding box, clamped to the framebuffer, and grow it to whole
	// 16x4 strips.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
	const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
	const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
	const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16);
	const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);

	// Prepare interpolated attributes.
	// Each color channel is evaluated as c2 + (c0 - c2) * l0 + (c1 - c2) * l1, so the
	// per-channel deltas and the c2 base are splatted once here.
#if !SWR_CONFIG_NO_PIXEL_SHADER
	const vec4f v_c0 = vec4f_fromRGBA8(color0);
	const vec4f v_c1 = vec4f_fromRGBA8(color1);
	const vec4f v_c2 = vec4f_fromRGBA8(color2);
	const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
	const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
	const vec4f v_r2 = vec4f_getXXXX(v_c2);
	const vec4f v_g2 = vec4f_getYYYY(v_c2);
	const vec4f v_b2 = vec4f_getZZZZ(v_c2);
	const vec4f v_a2 = vec4f_getWWWW(v_c2);
	const vec4f v_dr02 = vec4f_getXXXX(v_c02);
	const vec4f v_dg02 = vec4f_getYYYY(v_c02);
	const vec4f v_db02 = vec4f_getZZZZ(v_c02);
	const vec4f v_da02 = vec4f_getWWWW(v_c02);
	const vec4f v_dr12 = vec4f_getXXXX(v_c12);
	const vec4f v_dg12 = vec4f_getYYYY(v_c12);
	const vec4f v_db12 = vec4f_getZZZZ(v_c12);
	const vec4f v_da12 = vec4f_getWWWW(v_c12);

	// Barycentric coordinate normalization
	const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

	// Triangle setup. Edge i is the edge opposite to vertex i.
	const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
	const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
	const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

	// Trivial reject/accept corner offsets relative to block min/max.
	// Lanes 0..2 hold edge0..edge2, lane 3 is unused padding.
	const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
	const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);
	const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
	const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
	const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);
	const vec4i v_zero = vec4i_zero();
	const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
	const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);

	// The trivial reject corner is the block corner where the edge function is largest:
	// step to the far side of the block only along axes with a non-negative increment.
	// (Presumably vec4i_andnot(a, b) computes ~a & b, like the SSE2 instruction - confirm.)
	const vec4i v_trivialRejectOffset = vec4i_add(
		vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
		vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
	);
	const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
	const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);

	// The trivial accept corner is the opposite corner of the block.
	const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
	const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
	const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

	// Per-edge increments for the 4 pixels of a row (x offsets 0..3) and for one row down.
	const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
	const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
	const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
	const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
	const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

	// Rasterize
	const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));
	const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
	const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
	const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
	const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
	const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
	const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);

	// Lane i of v_wN_blockY holds edge N evaluated at the min corner of the i-th 4x4 block
	// of the current 16x4 strip.
	const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
	vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
	vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
	vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));

	for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) {
		uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
		vec4i v_w0_blockMin = v_w0_blockY;
		vec4i v_w1_blockMin = v_w1_blockY;
		vec4i v_w2_blockMin = v_w2_blockY;

		for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) {
			// Evaluate each edge function at its trivial reject corner (the most positive block corner).
			// If the trivial reject corner of any edge is negative (outside the edge) then the triangle
			// does not touch the block.
			const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
			const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
			const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
			const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);

			// OR-ing the three values leaves the sign bit set if any of them is negative.
			// Bit i of the mask is set when block i is NOT rejected.
			// (Presumably vec4i_getSignMask() returns one sign bit per lane, lane 0 in bit 0 - confirm.)
			uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
			if (trivialRejectBlockMask == 0) {
				// None of the 4 blocks touches the triangle; move on to the next strip.
				v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
				v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
				v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
				continue;
			}

			// Evaluate each edge function at its trivial accept corner (the most negative block corner).
			// If the trivial accept corner of all edges is positive (inside the edge) then the triangle
			// fully covers the block.
			const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
			const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
			const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
			const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);

			// Bit i is set when block i is only partially covered.
			uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

			// Spill the per-block edge values so each block can be addressed by index below.
			// NOTE(review): if vec4i_toInt4va() requires a 16-byte aligned destination, these
			// stack arrays need an explicit alignment - confirm.
			int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
			vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
			vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
			vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

			// Walk the 4 blocks of the strip. The loop ends as soon as no higher block is touched.
			for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
				++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1)
			{
				if ((trivialRejectBlockMask & 1) == 0) {
					// Block does not touch the triangle.
					continue;
				}

				uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];

				// Edge values for the 4 pixels of the block's first row.
				vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
				vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);

				if ((trivialAcceptBlockMask & 1) != 0) {
					// Partial block
					vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
					vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
					vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
					vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
					vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

					// Calculate the (inverse) pixel mask.
					// If any of the barycentric coordinates is negative, the pixel mask will
					// be equal to 0xFFFFFFFF for that pixel. This mask is used when storing each row
					// to blend between the existing framebuffer values and the new values.
					const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
					const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
					const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
					const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

					// Row 0 (skipped entirely when all 4 pixels are outside the triangle)
					if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
					}

					// Row 1
					if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
					}

					// Row 2
					if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
						const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
						const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
						const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
						const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
						const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
						const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca);
#endif
						vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
					}
				} else {
					// Full block: every pixel is covered, so edge 2 is not needed (only w0 and w1
					// feed the color interpolation) and the rows are stored without a mask.
#if !SWR_CONFIG_NO_PIXEL_SHADER
					const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
					const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
					const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
					const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
					const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
					const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

					// Row 0
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
						const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
						const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
						const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
						const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
						const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
						const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
						vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
					}

					// Row 1
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
						const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
						const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
						const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
						const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
						const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
						const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
						vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
					}

					// Row 2
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
						const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
						const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
						const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
						const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
						const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
						const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
						vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
					}

					// Row 3
					{
#if SWR_CONFIG_NO_PIXEL_SHADER
						const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
						const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
						const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
						const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
						const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
						const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
						const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
						const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
						vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
					}
				}
			}

			// Advance all 4 lanes to the next 16-pixel wide strip.
			v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
			v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
			v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
		}

		// Advance to the next row of blocks (4 pixels down).
		v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
		v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
		v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
	}
}
// Same as the corresponding SSE2 version, except for the use of pshufb.
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2) | |
{ | |
// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW. | |
int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2); | |
if (iarea == 0) { | |
// Degenerate triangle with 0 area. | |
return; | |
} else if (iarea < 0) { | |
// Swap (x1, y1) <-> (x2, y2) | |
{ int32_t tmp = x1; x1 = x2; x2 = tmp; } | |
{ int32_t tmp = y1; y1 = y2; y2 = tmp; } | |
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; } | |
iarea = -iarea; | |
} | |
// Compute triangle bounding box | |
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0); | |
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0); | |
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1)); | |
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1)); | |
const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize); | |
const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize); | |
const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, kBlockSize); | |
const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, kBlockSize); | |
const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned; | |
const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned; | |
// Prepare interpolated attributes | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4f v_c0 = vec4f_fromRGBA8(color0); | |
const vec4f v_c1 = vec4f_fromRGBA8(color1); | |
const vec4f v_c2 = vec4f_fromRGBA8(color2); | |
const vec4f v_c02 = vec4f_sub(v_c0, v_c2); | |
const vec4f v_c12 = vec4f_sub(v_c1, v_c2); | |
const vec4f v_r2 = vec4f_getXXXX(v_c2); | |
const vec4f v_g2 = vec4f_getYYYY(v_c2); | |
const vec4f v_b2 = vec4f_getZZZZ(v_c2); | |
const vec4f v_a2 = vec4f_getWWWW(v_c2); | |
const vec4f v_dr02 = vec4f_getXXXX(v_c02); | |
const vec4f v_dg02 = vec4f_getYYYY(v_c02); | |
const vec4f v_db02 = vec4f_getZZZZ(v_c02); | |
const vec4f v_da02 = vec4f_getWWWW(v_c02); | |
const vec4f v_dr12 = vec4f_getXXXX(v_c12); | |
const vec4f v_dg12 = vec4f_getYYYY(v_c12); | |
const vec4f v_db12 = vec4f_getZZZZ(v_c12); | |
const vec4f v_da12 = vec4f_getWWWW(v_c12); | |
#endif | |
// Triangle setup | |
const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1); | |
const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2); | |
const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0); | |
// Barycentric coordinate normalization | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea); | |
#endif | |
// Trivial reject/accept corner offsets relative to block min/max. | |
const int32_t w0_blockMax_dx = edge0.m_dx * (kBlockSize - 1); | |
const int32_t w0_blockMax_dy = edge0.m_dy * (kBlockSize - 1); | |
const int32_t w1_blockMax_dx = edge1.m_dx * (kBlockSize - 1); | |
const int32_t w1_blockMax_dy = edge1.m_dy * (kBlockSize - 1); | |
const int32_t w2_blockMax_dx = edge2.m_dx * (kBlockSize - 1); | |
const int32_t w2_blockMax_dy = edge2.m_dy * (kBlockSize - 1); | |
const vec4i v_trivialRejectOffset = vec4i_fromInt4( | |
(edge0.m_dx >= 0 ? w0_blockMax_dx : 0) + (edge0.m_dy >= 0 ? w0_blockMax_dy : 0), | |
(edge1.m_dx >= 0 ? w1_blockMax_dx : 0) + (edge1.m_dy >= 0 ? w1_blockMax_dy : 0), | |
(edge2.m_dx >= 0 ? w2_blockMax_dx : 0) + (edge2.m_dy >= 0 ? w2_blockMax_dy : 0), | |
0 | |
); | |
const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_fromInt4( | |
(w0_blockMax_dx + w0_blockMax_dy), | |
(w1_blockMax_dx + w1_blockMax_dy), | |
(w2_blockMax_dx + w2_blockMax_dy), | |
0), v_trivialRejectOffset | |
); | |
const vec4i v_trivialRejectOffset_dx = vec4i_fromInt4( | |
(edge0.m_dx >= 0 ? w0_blockMax_dx : 0), | |
(edge1.m_dx >= 0 ? w1_blockMax_dx : 0), | |
(edge2.m_dx >= 0 ? w2_blockMax_dx : 0), | |
0 | |
); | |
const vec4i v_edge0_dx0123 = vec4i_fromInt4(0, edge0.m_dx, edge0.m_dx * 2, edge0.m_dx * 3); | |
const vec4i v_edge1_dx0123 = vec4i_fromInt4(0, edge1.m_dx, edge1.m_dx * 2, edge1.m_dx * 3); | |
const vec4i v_edge2_dx0123 = vec4i_fromInt4(0, edge2.m_dx, edge2.m_dx * 2, edge2.m_dx * 3); | |
const vec4i v_edge0_dx4 = vec4i_fromInt(edge0.m_dx * 4); | |
const vec4i v_edge1_dx4 = vec4i_fromInt(edge1.m_dx * 4); | |
const vec4i v_edge2_dx4 = vec4i_fromInt(edge2.m_dx * 4); | |
const vec4i v_edge012__dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0); | |
// Rasterize | |
const vec4i v_w_bboxMin = vec4i_fromInt4( | |
swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned), | |
swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned), | |
swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned), | |
0 | |
); | |
const vec4i v_w_nextBlock_dx = vec4i_fromInt4(edge0.m_dx * kBlockSize, edge1.m_dx * kBlockSize, edge2.m_dx * kBlockSize, 0); | |
const vec4i v_w_nextBlock_dy = vec4i_fromInt4(edge0.m_dy * kBlockSize, edge1.m_dy * kBlockSize, edge2.m_dy * kBlockSize, 0); | |
vec4i v_w_blockY = v_w_bboxMin; | |
for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY; blockMinY += kBlockSize) { | |
vec4i v_w_blockMin = v_w_blockY; | |
for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX; blockMinX += kBlockSize) { | |
// Evaluate each edge function at its trivial reject corner (the most positive block corner). | |
// If the trivial rejct corner of any edge is negative (outside the edge) then the triangle | |
// does not touch the block. | |
const vec4i v_w_trivialReject = vec4i_add(v_w_blockMin, v_trivialRejectOffset); | |
if (vec4i_any_neg_SSE2(v_w_trivialReject)) { | |
v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx); | |
continue; | |
} | |
// At this point we know that the triangle touches the tile. There are 2 cases: | |
// - The tile is fully covered by the triangle. | |
// - The tile is partially covered by the triangle. | |
// | |
// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path). | |
// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row. | |
// | |
// Evaluate each edge function at its trivial accept corner (the most negative block corner). | |
// The trivial accept corner is the opposite corner to the trivial reject corner. | |
// If all trivial accept corners are inside their respective edges then the block is fully | |
// covered by the triangle (1st case). Otherwise it's partially covered (2nd case). | |
// | |
// The trivial accept corner is calculated by subtracting the trivial reject corner offset from | |
// the block's max point. | |
// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset | |
// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite | |
// (trivial accept) corner: | |
// trivialAcceptCornerX = blockMaxX - 0 = blockMaxX | |
// trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY | |
// | |
const vec4i v_w_trivialAccept = vec4i_add(v_w_blockMin, v_trivialAcceptOffset); | |
if (vec4i_any_neg_SSE2(v_w_trivialAccept)) { | |
// Partial block | |
vec4i v_w_row = v_w_blockMin; | |
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width]; | |
for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) { | |
const vec4i v_w_rowTrivialReject = vec4i_add(v_w_row, v_trivialRejectOffset_dx); | |
if (!vec4i_any_neg_SSE2(v_w_rowTrivialReject)) { | |
vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123); | |
vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123); | |
vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123); | |
for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) { | |
// Calculate the (inverse) pixel mask. | |
// If any of the barycentric coordinates is negative, the pixel mask will | |
// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop | |
// to blend between the existing framebuffer values and the new values. | |
const vec4i v_izero = vec4i_zero(); | |
const vec4i v_w0_lt = vec4i_cmplt(v_w0, v_izero); | |
const vec4i v_w1_lt = vec4i_cmplt(v_w1, v_izero); | |
const vec4i v_w2_lt = vec4i_cmplt(v_w2, v_izero); | |
const vec4i v_notPixelMask = vec4i_or(v_w0_lt, vec4i_or(v_w1_lt, v_w2_lt)); | |
{ | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_oldFB = vec4i_fromInt4va(&fb_row[px]); | |
const vec4i v_newFB = vec4i_or(vec4i_and(v_notPixelMask, v_oldFB), vec4i_andnot(v_notPixelMask, vec4i_fromInt(-1))); | |
vec4i_toInt4va(v_newFB, &fb_row[px]); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area); | |
// l2 = 1.0f - (l0 + l1) | |
// | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=> | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=> | |
// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=> | |
// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=> | |
// | |
// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2)); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
// Pack independent R32/G32/B32/A32 values of the 4 pixels into RGBA8. | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca); | |
// Store result using the pixel mask | |
const vec4i v_newFB = vec4i_or( | |
vec4i_and(v_notPixelMask, vec4i_fromInt4va(&fb_row[px])), | |
vec4i_andnot(v_notPixelMask, v_rgba8) | |
); | |
vec4i_toInt4va(v_newFB, &fb_row[px]); | |
#endif | |
} | |
v_w0 = vec4i_add(v_w0, v_edge0_dx4); | |
v_w1 = vec4i_add(v_w1, v_edge1_dx4); | |
v_w2 = vec4i_add(v_w2, v_edge2_dx4); | |
} | |
} | |
v_w_row = vec4i_add(v_w_row, v_edge012__dy); | |
fb_row += ctx->m_Width; | |
} | |
} else { | |
// Full block | |
vec4i v_w_row = v_w_blockMin; | |
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width]; | |
for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) { | |
// Calculate barycentric coords at pxmin | |
vec4i v_w0 = vec4i_add(vec4i_getXXXX(v_w_row), v_edge0_dx0123); | |
vec4i v_w1 = vec4i_add(vec4i_getYYYY(v_w_row), v_edge1_dx0123); | |
vec4i v_w2 = vec4i_add(vec4i_getZZZZ(v_w_row), v_edge2_dx0123); | |
for (int32_t px = 0; px < (int32_t)kBlockSize; px += 4) { | |
// (px, py) is guaranteed to be inside the triangle (or on one of the edges) | |
// Render the pixel | |
{ | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
vec4i_toInt4va(vec4i_fromInt(-1), &fb_row[px]); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1), v_inv_area); | |
// l2 = 1.0f - (l0 + l1) | |
// | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=> | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=> | |
// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=> | |
// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=> | |
// | |
// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2)); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSE2(v_cr, v_cg, v_cb, v_ca); | |
vec4i_toInt4va(v_rgba8, &fb_row[px]); | |
#endif | |
} | |
v_w0 = vec4i_add(v_w0, v_edge0_dx4); | |
v_w1 = vec4i_add(v_w1, v_edge1_dx4); | |
v_w2 = vec4i_add(v_w2, v_edge2_dx4); | |
} | |
v_w_row = vec4i_add(v_w_row, v_edge012__dy); | |
fb_row += ctx->m_Width; | |
} | |
} | |
v_w_blockMin = vec4i_add(v_w_blockMin, v_w_nextBlock_dx); | |
} | |
v_w_blockY = vec4i_add(v_w_blockY, v_w_nextBlock_dy); | |
} | |
} | |
// Rasterizes one triangle into ctx->m_FrameBuffer, interpolating the three vertex colors
// (color0/color1/color2, one per vertex) with barycentric weights derived from integer edge functions.
//
// Traversal is hierarchical: the screen-clamped bounding box is walked in tiles of 16x4 pixels.
// Each tile is treated as four 4x4 blocks laid out side by side; lane i of every per-edge
// vector holds that edge's value for block i, so the four blocks are classified at once:
//  - trivially rejected: some edge is negative at the block corner where it is largest,
//  - trivially accepted: every edge is non-negative at the block corner where it is smallest,
//  - partial: everything else (per-pixel coverage mask required).
//
// ctx            - render context; m_FrameBuffer, m_Width and m_Height are read, m_FrameBuffer is written.
// (x0,y0)..(x2,y2) - vertex positions in pixels. Either winding is accepted.
// color0..color2 - per-vertex colors, decoded with vec4f_fromRGBA8().
//
// When SWR_CONFIG_NO_PIXEL_SHADER is non-zero, covered pixels are written as 0xFFFFFFFF instead
// of the interpolated color.
//
// NOTE(review): whole 16x4 tiles are addressed, so this presumably relies on the framebuffer
// dimensions being multiples of 16 (width) and 4 (height) - confirm against swrCreateContext().
static void swrDrawTriangleSSSE3_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Make sure the triangle is CCW. If it's not, swap points 1 and 2 to make it CCW.
    int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2). The colors travel with their vertices.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Compute the triangle bounding box, clamped to the framebuffer, then snap it to the
    // 16x4 tile grid used by the loops below.
    //
    // NOTE(review): bboxMaxX/bboxMaxY are inclusive pixel coordinates. If swr_alignUp() returns
    // its argument unchanged when it is already a multiple of the alignment, a max coordinate
    // that lands exactly on a tile boundary leaves the last tile column/row unvisited - confirm
    // against swr_alignUp() and the reference rasterizer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1));
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1));
    const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16);
    const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4);
    const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16);
    const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4);

    // Prepare interpolated attributes.
    // Each color channel is evaluated as: c = d02 * l0 + d12 * l1 + c2,
    // where d02 = c0 - c2, d12 = c1 - c2 and l0/l1 are the normalized barycentric weights.
#if !SWR_CONFIG_NO_PIXEL_SHADER
    const vec4f v_c0 = vec4f_fromRGBA8(color0);
    const vec4f v_c1 = vec4f_fromRGBA8(color1);
    const vec4f v_c2 = vec4f_fromRGBA8(color2);
    const vec4f v_c02 = vec4f_sub(v_c0, v_c2);
    const vec4f v_c12 = vec4f_sub(v_c1, v_c2);
    const vec4f v_r2 = vec4f_getXXXX(v_c2);
    const vec4f v_g2 = vec4f_getYYYY(v_c2);
    const vec4f v_b2 = vec4f_getZZZZ(v_c2);
    const vec4f v_a2 = vec4f_getWWWW(v_c2);
    const vec4f v_dr02 = vec4f_getXXXX(v_c02);
    const vec4f v_dg02 = vec4f_getYYYY(v_c02);
    const vec4f v_db02 = vec4f_getZZZZ(v_c02);
    const vec4f v_da02 = vec4f_getWWWW(v_c02);
    const vec4f v_dr12 = vec4f_getXXXX(v_c12);
    const vec4f v_dg12 = vec4f_getYYYY(v_c12);
    const vec4f v_db12 = vec4f_getZZZZ(v_c12);
    const vec4f v_da12 = vec4f_getWWWW(v_c12);

    // Barycentric coordinate normalization
    const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea);
#endif

    // Triangle setup: edge N is the edge opposite vertex N.
    const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1);
    const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2);
    const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0);

    // Trivial reject/accept corner offsets relative to the block min corner.
    // Lanes X/Y/Z hold edge0/edge1/edge2; lane W is unused.
    const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0);
    const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0);
    const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1);
    const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1);
    const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1);
    const vec4i v_zero = vec4i_zero();
    const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero);
    const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero);

    // Reject corner: step to the far side of the block only along axes where the edge
    // function increases (step >= 0), i.e. the corner where the edge value is largest.
    const vec4i v_trivialRejectOffset = vec4i_add(
        vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx),
        vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy)
    );
    const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset);
    const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset);
    const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset);

    // Accept corner: the diagonally opposite corner, where the edge value is smallest.
    const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset);
    const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset);
    const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset);
    const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset);

    // Per-pixel steps for the 4 pixels of a block row, and the per-row step of each edge.
    const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3);
    const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets);
    const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy);
    const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy);
    const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy);

    // Rasterize
    const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned));
    const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned));
    const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned));

    // Stepping to the next tile moves 16 pixels in X (4 blocks) or 4 pixels in Y.
    const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16);
    const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16);
    const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16);
    const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4);
    const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4);
    const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4);

    // Lane i starts at the min corner of block i of the first tile (x offsets 0, 4, 8, 12).
    const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12);
    vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets));
    vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets));
    vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets));

    for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) {
        uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width];
        vec4i v_w0_blockMin = v_w0_blockY;
        vec4i v_w1_blockMin = v_w1_blockY;
        vec4i v_w2_blockMin = v_w2_blockY;

        for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) {
            // Evaluate each edge function at its trivial reject corner (the most positive block corner).
            // If the trivial reject corner of any edge is negative (outside the edge) then the triangle
            // does not touch the block.
            // OR-ing the three edge values leaves the sign bit set when at least one of them is negative.
            const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0);
            const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1);
            const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2);
            const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject);

            // Bit i is set when block i was NOT rejected, i.e. it may contain covered pixels.
            uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F;
            if (trivialRejectBlockMask == 0) {
                // All four blocks of this tile are outside the triangle.
                v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
                v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
                v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
                continue;
            }

            // Evaluate each edge function at its trivial accept corner (the most negative block corner).
            // If the trivial accept corner of all edges is positive (inside the edge) then the triangle
            // fully covers the block.
            const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0);
            const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1);
            const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2);
            const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept);

            // Bit i is set when block i is only partially covered (some edge is negative at its accept corner).
            uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept);

            // Spill the per-block edge values so each block can be addressed by index below.
            // NOTE(review): if vec4i_toInt4va() is an aligned store, these arrays must be
            // 16-byte aligned - confirm.
            int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4];
            vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]);
            vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]);
            vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]);

            // Visit the surviving blocks of this tile; both masks are shifted in lockstep so
            // bit 0 always refers to block iBlock.
            for (uint32_t iBlock = 0; trivialRejectBlockMask != 0;
                ++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1) {
                if ((trivialRejectBlockMask & 1) == 0) {
                    continue;
                }

                uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4];

                // Edge values for the 4 pixels of the block's first row.
                vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123);
                vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123);

                if ((trivialAcceptBlockMask & 1) != 0) {
                    // Partial block
                    vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123);
                    vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
                    vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
                    vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy);
                    vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
                    vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
                    vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy);
                    vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
                    vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
                    vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy);

                    // Calculate the (inverse) pixel mask.
                    // If any of the barycentric coordinates is negative, the OR of the three has its
                    // sign bit set; shifting it arithmetically by 31 turns that into 0xFFFFFFFF for
                    // that pixel. The mask is used by the masked store to keep the existing
                    // framebuffer value for uncovered pixels.
                    const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0);
                    const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1);
                    const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2);
                    const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3);

                    // Row 0 (skipped entirely when none of its 4 pixels is covered)
                    if (!vec4i_all_neg_SSE2(v_w_row0_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[0]);
                    }

                    // Row 1
                    if (!vec4i_all_neg_SSE2(v_w_row1_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]);
                    }

                    // Row 2
                    if (!vec4i_all_neg_SSE2(v_w_row2_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]);
                    }

                    // Row 3
                    if (!vec4i_all_neg_SSE2(v_w_row3_or)) {
                        const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31);
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
                        const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
                        const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2)));
                        const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2)));
                        const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2)));
                        const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2)));
                        const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca);
#endif
                        vec4i_toInt4va_maskedInv_SSE2(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]);
                    }
                } else {
                    // Full block: every pixel is covered, so no coverage mask is needed and
                    // edge2 does not have to be evaluated (only l0 and l1 feed the interpolation).
#if !SWR_CONFIG_NO_PIXEL_SHADER
                    const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy);
                    const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy);
                    const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy);
                    const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy);
                    const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy);
                    const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy);
#endif

                    // Row 0
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row0 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area);
                        const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area);
                        const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2)));
                        const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2)));
                        const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2)));
                        const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2)));
                        const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0);
#endif
                        vec4i_toInt4va(v_rgba8_row0, &fb_row[0]);
                    }

                    // Row 1
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row1 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area);
                        const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area);
                        const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2)));
                        const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2)));
                        const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2)));
                        const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2)));
                        const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1);
#endif
                        vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]);
                    }

                    // Row 2
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row2 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area);
                        const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area);
                        const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2)));
                        const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2)));
                        const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2)));
                        const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2)));
                        const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2);
#endif
                        vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]);
                    }

                    // Row 3
                    {
#if SWR_CONFIG_NO_PIXEL_SHADER
                        const vec4i v_rgba8_row3 = vec4i_fromInt(-1);
#else
                        const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area);
                        const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area);
                        const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2)));
                        const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2)));
                        const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2)));
                        const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2)));
                        const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3);
#endif
                        vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]);
                    }
                }
            }

            // Advance to the next 16x4 tile in this tile row.
            v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx);
            v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx);
            v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx);
        }

        // Advance to the next tile row.
        v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy);
        v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy);
        v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy);
    }
}
static void swrDrawTriangleSSE41_HierarchicalLRB_Cond_4x4_v2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2) | |
{ | |
// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW. | |
int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2); | |
if (iarea == 0) { | |
// Degenerate triangle with 0 area. | |
return; | |
} else if (iarea < 0) { | |
// Swap (x1, y1) <-> (x2, y2) | |
{ int32_t tmp = x1; x1 = x2; x2 = tmp; } | |
{ int32_t tmp = y1; y1 = y2; y2 = tmp; } | |
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; } | |
iarea = -iarea; | |
} | |
// Compute triangle bounding box | |
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0); | |
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0); | |
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1)); | |
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1)); | |
const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, 16); | |
const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, 4); | |
const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, 16); | |
const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, 4); | |
const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned; | |
const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned; | |
// Prepare interpolated attributes | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4f v_c0 = vec4f_fromRGBA8(color0); | |
const vec4f v_c1 = vec4f_fromRGBA8(color1); | |
const vec4f v_c2 = vec4f_fromRGBA8(color2); | |
const vec4f v_c02 = vec4f_sub(v_c0, v_c2); | |
const vec4f v_c12 = vec4f_sub(v_c1, v_c2); | |
const vec4f v_r2 = vec4f_getXXXX(v_c2); | |
const vec4f v_g2 = vec4f_getYYYY(v_c2); | |
const vec4f v_b2 = vec4f_getZZZZ(v_c2); | |
const vec4f v_a2 = vec4f_getWWWW(v_c2); | |
const vec4f v_dr02 = vec4f_getXXXX(v_c02); | |
const vec4f v_dg02 = vec4f_getYYYY(v_c02); | |
const vec4f v_db02 = vec4f_getZZZZ(v_c02); | |
const vec4f v_da02 = vec4f_getWWWW(v_c02); | |
const vec4f v_dr12 = vec4f_getXXXX(v_c12); | |
const vec4f v_dg12 = vec4f_getYYYY(v_c12); | |
const vec4f v_db12 = vec4f_getZZZZ(v_c12); | |
const vec4f v_da12 = vec4f_getWWWW(v_c12); | |
// Barycentric coordinate normalization | |
const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea); | |
#endif | |
// Triangle setup | |
const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1); | |
const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2); | |
const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0); | |
// Trivial reject/accept corner offsets relative to block min/max. | |
const vec4i v_edge_dx = vec4i_fromInt4(edge0.m_dx, edge1.m_dx, edge2.m_dx, 0); | |
const vec4i v_edge_dy = vec4i_fromInt4(edge0.m_dy, edge1.m_dy, edge2.m_dy, 0); | |
const vec4i v_blockSize_m1 = vec4i_fromInt(4 - 1); | |
const vec4i v_w_blockMax_dx = vec4i_mullo_SSE2(v_edge_dx, v_blockSize_m1); | |
const vec4i v_w_blockMax_dy = vec4i_mullo_SSE2(v_edge_dy, v_blockSize_m1); | |
const vec4i v_zero = vec4i_zero(); | |
const vec4i v_edge_dx_lt = vec4i_cmplt(v_edge_dx, v_zero); | |
const vec4i v_edge_dy_lt = vec4i_cmplt(v_edge_dy, v_zero); | |
const vec4i v_trivialRejectOffset = vec4i_add( | |
vec4i_andnot(v_edge_dx_lt, v_w_blockMax_dx), | |
vec4i_andnot(v_edge_dy_lt, v_w_blockMax_dy) | |
); | |
const vec4i v_trivialRejectOffset_0 = vec4i_getXXXX(v_trivialRejectOffset); | |
const vec4i v_trivialRejectOffset_1 = vec4i_getYYYY(v_trivialRejectOffset); | |
const vec4i v_trivialRejectOffset_2 = vec4i_getZZZZ(v_trivialRejectOffset); | |
const vec4i v_trivialAcceptOffset = vec4i_sub(vec4i_add(v_w_blockMax_dx, v_w_blockMax_dy), v_trivialRejectOffset); | |
const vec4i v_trivialAcceptOffset_0 = vec4i_getXXXX(v_trivialAcceptOffset); | |
const vec4i v_trivialAcceptOffset_1 = vec4i_getYYYY(v_trivialAcceptOffset); | |
const vec4i v_trivialAcceptOffset_2 = vec4i_getZZZZ(v_trivialAcceptOffset); | |
const vec4i v_pixelOffsets = vec4i_fromInt4(0, 1, 2, 3); | |
const vec4i v_edge0_dx0123 = vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_pixelOffsets); | |
const vec4i v_edge1_dx0123 = vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_pixelOffsets); | |
const vec4i v_edge2_dx0123 = vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_pixelOffsets); | |
const vec4i v_edge0_dy = vec4i_getXXXX(v_edge_dy); | |
const vec4i v_edge1_dy = vec4i_getYYYY(v_edge_dy); | |
const vec4i v_edge2_dy = vec4i_getZZZZ(v_edge_dy); | |
// Rasterize | |
const vec4i v_w0_bboxMin = vec4i_fromInt(swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned)); | |
const vec4i v_w1_bboxMin = vec4i_fromInt(swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned)); | |
const vec4i v_w2_bboxMin = vec4i_fromInt(swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned)); | |
const vec4i v_w0_nextBlock_dx = vec4i_fromInt(edge0.m_dx * 16); | |
const vec4i v_w1_nextBlock_dx = vec4i_fromInt(edge1.m_dx * 16); | |
const vec4i v_w2_nextBlock_dx = vec4i_fromInt(edge2.m_dx * 16); | |
const vec4i v_w0_nextBlock_dy = vec4i_fromInt(edge0.m_dy * 4); | |
const vec4i v_w1_nextBlock_dy = vec4i_fromInt(edge1.m_dy * 4); | |
const vec4i v_w2_nextBlock_dy = vec4i_fromInt(edge2.m_dy * 4); | |
const vec4i v_blockOffsets = vec4i_fromInt4(0, 4, 8, 12); | |
vec4i v_w0_blockY = vec4i_add(v_w0_bboxMin, vec4i_mullo_SSE2(vec4i_getXXXX(v_edge_dx), v_blockOffsets)); | |
vec4i v_w1_blockY = vec4i_add(v_w1_bboxMin, vec4i_mullo_SSE2(vec4i_getYYYY(v_edge_dx), v_blockOffsets)); | |
vec4i v_w2_blockY = vec4i_add(v_w2_bboxMin, vec4i_mullo_SSE2(vec4i_getZZZZ(v_edge_dx), v_blockOffsets)); | |
for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY_aligned; blockMinY += 4) { | |
uint32_t* fb_blockY = &ctx->m_FrameBuffer[blockMinY * ctx->m_Width]; | |
vec4i v_w0_blockMin = v_w0_blockY; | |
vec4i v_w1_blockMin = v_w1_blockY; | |
vec4i v_w2_blockMin = v_w2_blockY; | |
for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX_aligned; blockMinX += 16) { | |
// Evaluate each edge function at its trivial reject corner (the most positive block corner). | |
// If the trivial rejct corner of any edge is negative (outside the edge) then the triangle | |
// does not touch the block. | |
const vec4i v_w0_trivialReject = vec4i_add(v_w0_blockMin, v_trivialRejectOffset_0); | |
const vec4i v_w1_trivialReject = vec4i_add(v_w1_blockMin, v_trivialRejectOffset_1); | |
const vec4i v_w2_trivialReject = vec4i_add(v_w2_blockMin, v_trivialRejectOffset_2); | |
const vec4i v_w_trivialReject = vec4i_or3(v_w0_trivialReject, v_w1_trivialReject, v_w2_trivialReject); | |
uint32_t trivialRejectBlockMask = ~vec4i_getSignMask(v_w_trivialReject) & 0x0F; | |
if (trivialRejectBlockMask == 0) { | |
v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx); | |
v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx); | |
v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx); | |
continue; | |
} | |
// Evaluate each edge function at its trivial accept corner (the most negative block corner). | |
// If the trivial accept corner of all edges is positive (inside the edge) then the triangle | |
// fully covers the block. | |
const vec4i v_w0_trivialAccept = vec4i_add(v_w0_blockMin, v_trivialAcceptOffset_0); | |
const vec4i v_w1_trivialAccept = vec4i_add(v_w1_blockMin, v_trivialAcceptOffset_1); | |
const vec4i v_w2_trivialAccept = vec4i_add(v_w2_blockMin, v_trivialAcceptOffset_2); | |
const vec4i v_w_trivialAccept = vec4i_or3(v_w0_trivialAccept, v_w1_trivialAccept, v_w2_trivialAccept); | |
uint32_t trivialAcceptBlockMask = vec4i_getSignMask(v_w_trivialAccept); | |
int32_t w0_blockMin[4], w1_blockMin[4], w2_blockMin[4]; | |
vec4i_toInt4va(v_w0_blockMin, &w0_blockMin[0]); | |
vec4i_toInt4va(v_w1_blockMin, &w1_blockMin[0]); | |
vec4i_toInt4va(v_w2_blockMin, &w2_blockMin[0]); | |
for (uint32_t iBlock = 0; trivialRejectBlockMask != 0; | |
++iBlock, trivialRejectBlockMask >>= 1, trivialAcceptBlockMask >>= 1) { | |
if ((trivialRejectBlockMask & 1) == 0) { | |
continue; | |
} | |
uint32_t* fb_row = &fb_blockY[blockMinX + iBlock * 4]; | |
vec4i v_w0_row0 = vec4i_add(vec4i_fromInt(w0_blockMin[iBlock]), v_edge0_dx0123); | |
vec4i v_w1_row0 = vec4i_add(vec4i_fromInt(w1_blockMin[iBlock]), v_edge1_dx0123); | |
if ((trivialAcceptBlockMask & 1) != 0) { | |
// Partial block | |
vec4i v_w2_row0 = vec4i_add(vec4i_fromInt(w2_blockMin[iBlock]), v_edge2_dx0123); | |
vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy); | |
vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy); | |
vec4i v_w2_row1 = vec4i_add(v_w2_row0, v_edge2_dy); | |
vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy); | |
vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy); | |
vec4i v_w2_row2 = vec4i_add(v_w2_row1, v_edge2_dy); | |
vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy); | |
vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy); | |
vec4i v_w2_row3 = vec4i_add(v_w2_row2, v_edge2_dy); | |
// Calculate the (inverse) pixel mask. | |
// If any of the barycentric coordinates is negative, the pixel mask will | |
// be equal to 0xFFFFFFFF for that pixel. This mask is used at the end of the loop | |
// to blend between the existing framebuffer values and the new values. | |
const vec4i v_w_row0_or = vec4i_or3(v_w0_row0, v_w1_row0, v_w2_row0); | |
const vec4i v_w_row1_or = vec4i_or3(v_w0_row1, v_w1_row1, v_w2_row1); | |
const vec4i v_w_row2_or = vec4i_or3(v_w0_row2, v_w1_row2, v_w2_row2); | |
const vec4i v_w_row3_or = vec4i_or3(v_w0_row3, v_w1_row3, v_w2_row3); | |
// Row 0 | |
if (!vec4i_all_neg_SSE2(v_w_row0_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row0_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[0]); | |
} | |
// Row 1 | |
if (!vec4i_all_neg_SSE2(v_w_row1_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row1_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width]); | |
} | |
// Row 2 | |
if (!vec4i_all_neg_SSE2(v_w_row2_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row2_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 2]); | |
} | |
// Row 3 | |
if (!vec4i_all_neg_SSE2(v_w_row3_or)) { | |
const vec4i v_notPixelMask = vec4i_sar(v_w_row3_or, 31); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area); | |
const vec4f v_l1 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area); | |
const vec4i v_cr = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0, vec4f_madd_SSE2(v_dr12, v_l1, v_r2))); | |
const vec4i v_cg = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0, vec4f_madd_SSE2(v_dg12, v_l1, v_g2))); | |
const vec4i v_cb = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0, vec4f_madd_SSE2(v_db12, v_l1, v_b2))); | |
const vec4i v_ca = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0, vec4f_madd_SSE2(v_da12, v_l1, v_a2))); | |
const vec4i v_rgba8 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr, v_cg, v_cb, v_ca); | |
#endif | |
vec4i_toInt4va_maskedInv_SSE41(v_rgba8, v_notPixelMask, &fb_row[ctx->m_Width * 3]); | |
} | |
} else { | |
// Full block | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_w0_row1 = vec4i_add(v_w0_row0, v_edge0_dy); | |
const vec4i v_w1_row1 = vec4i_add(v_w1_row0, v_edge1_dy); | |
const vec4i v_w0_row2 = vec4i_add(v_w0_row1, v_edge0_dy); | |
const vec4i v_w1_row2 = vec4i_add(v_w1_row1, v_edge1_dy); | |
const vec4i v_w0_row3 = vec4i_add(v_w0_row2, v_edge0_dy); | |
const vec4i v_w1_row3 = vec4i_add(v_w1_row2, v_edge1_dy); | |
#endif | |
// Row 0 | |
{ | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row0 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row0 = vec4f_mul(vec4f_fromVec4i(v_w0_row0), v_inv_area); | |
const vec4f v_l1_row0 = vec4f_mul(vec4f_fromVec4i(v_w1_row0), v_inv_area); | |
const vec4i v_cr_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row0, vec4f_madd_SSE2(v_dr12, v_l1_row0, v_r2))); | |
const vec4i v_cg_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row0, vec4f_madd_SSE2(v_dg12, v_l1_row0, v_g2))); | |
const vec4i v_cb_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row0, vec4f_madd_SSE2(v_db12, v_l1_row0, v_b2))); | |
const vec4i v_ca_row0 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row0, vec4f_madd_SSE2(v_da12, v_l1_row0, v_a2))); | |
const vec4i v_rgba8_row0 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row0, v_cg_row0, v_cb_row0, v_ca_row0); | |
#endif | |
vec4i_toInt4va(v_rgba8_row0, &fb_row[0]); | |
} | |
// Row 1 | |
{ | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row1 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row1 = vec4f_mul(vec4f_fromVec4i(v_w0_row1), v_inv_area); | |
const vec4f v_l1_row1 = vec4f_mul(vec4f_fromVec4i(v_w1_row1), v_inv_area); | |
const vec4i v_cr_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row1, vec4f_madd_SSE2(v_dr12, v_l1_row1, v_r2))); | |
const vec4i v_cg_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row1, vec4f_madd_SSE2(v_dg12, v_l1_row1, v_g2))); | |
const vec4i v_cb_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row1, vec4f_madd_SSE2(v_db12, v_l1_row1, v_b2))); | |
const vec4i v_ca_row1 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row1, vec4f_madd_SSE2(v_da12, v_l1_row1, v_a2))); | |
const vec4i v_rgba8_row1 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row1, v_cg_row1, v_cb_row1, v_ca_row1); | |
#endif | |
vec4i_toInt4va(v_rgba8_row1, &fb_row[ctx->m_Width]); | |
} | |
// Row 2 | |
{ | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row2 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row2 = vec4f_mul(vec4f_fromVec4i(v_w0_row2), v_inv_area); | |
const vec4f v_l1_row2 = vec4f_mul(vec4f_fromVec4i(v_w1_row2), v_inv_area); | |
const vec4i v_cr_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row2, vec4f_madd_SSE2(v_dr12, v_l1_row2, v_r2))); | |
const vec4i v_cg_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row2, vec4f_madd_SSE2(v_dg12, v_l1_row2, v_g2))); | |
const vec4i v_cb_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row2, vec4f_madd_SSE2(v_db12, v_l1_row2, v_b2))); | |
const vec4i v_ca_row2 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row2, vec4f_madd_SSE2(v_da12, v_l1_row2, v_a2))); | |
const vec4i v_rgba8_row2 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row2, v_cg_row2, v_cb_row2, v_ca_row2); | |
#endif | |
vec4i_toInt4va(v_rgba8_row2, &fb_row[ctx->m_Width * 2]); | |
} | |
// Row 3 | |
{ | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4i v_rgba8_row3 = vec4i_fromInt(-1); | |
#else | |
const vec4f v_l0_row3 = vec4f_mul(vec4f_fromVec4i(v_w0_row3), v_inv_area); | |
const vec4f v_l1_row3 = vec4f_mul(vec4f_fromVec4i(v_w1_row3), v_inv_area); | |
const vec4i v_cr_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dr02, v_l0_row3, vec4f_madd_SSE2(v_dr12, v_l1_row3, v_r2))); | |
const vec4i v_cg_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_dg02, v_l0_row3, vec4f_madd_SSE2(v_dg12, v_l1_row3, v_g2))); | |
const vec4i v_cb_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_db02, v_l0_row3, vec4f_madd_SSE2(v_db12, v_l1_row3, v_b2))); | |
const vec4i v_ca_row3 = vec4i_fromVec4f(vec4f_madd_SSE2(v_da02, v_l0_row3, vec4f_madd_SSE2(v_da12, v_l1_row3, v_a2))); | |
const vec4i v_rgba8_row3 = vec4i_packR32G32B32A32_to_RGBA8_SSSE3(v_cr_row3, v_cg_row3, v_cb_row3, v_ca_row3); | |
#endif | |
vec4i_toInt4va(v_rgba8_row3, &fb_row[ctx->m_Width * 3]); | |
} | |
} | |
} | |
v_w0_blockMin = vec4i_add(v_w0_blockMin, v_w0_nextBlock_dx); | |
v_w1_blockMin = vec4i_add(v_w1_blockMin, v_w1_nextBlock_dx); | |
v_w2_blockMin = vec4i_add(v_w2_blockMin, v_w2_nextBlock_dx); | |
} | |
v_w0_blockY = vec4i_add(v_w0_blockY, v_w0_nextBlock_dy); | |
v_w1_blockY = vec4i_add(v_w1_blockY, v_w1_nextBlock_dy); | |
v_w2_blockY = vec4i_add(v_w2_blockY, v_w2_nextBlock_dy); | |
} | |
} | |
// 2-level hierarchical rasterization using trivial reject/accept corners. | |
// | |
// Similar to swrDrawTriangleRef_HierarchicalLRB_Cond(), but for each partially covered block
// the range of valid rows is calculated and iterated, and for each touched row the range
// of valid pixels/columns is calculated and iterated. This way there is no need for conditionals
// inside the innermost loop, and the 3 barycentric coordinates are always greater than or equal
// to 0.
//
// Hard to vectorize because the valid row/column calculations are scalar.
//
// Even though there are multiple difficult-to-predict branches inside the inner loops, this
// seems to be marginally faster than the reference _NoCond() function.
#if 1 | |
static void swrDrawTriangleRef_HierarchicalLRB_NoCond(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2) | |
{ | |
// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW. | |
int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2); | |
if (iarea == 0) { | |
// Degenerate triangle with 0 area. | |
return; | |
} else if (iarea < 0) { | |
// Swap (x1, y1) <-> (x2, y2) | |
{ int32_t tmp = x1; x1 = x2; x2 = tmp; } | |
{ int32_t tmp = y1; y1 = y2; y2 = tmp; } | |
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; } | |
iarea = -iarea; | |
} | |
// Compute triangle bounding box | |
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0); | |
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0); | |
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1)); | |
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1)); | |
const int32_t bboxMinX_aligned = swr_alignDown(bboxMinX, kBlockSize); | |
const int32_t bboxMinY_aligned = swr_alignDown(bboxMinY, kBlockSize); | |
const int32_t bboxMaxX_aligned = swr_alignUp(bboxMaxX, kBlockSize); | |
const int32_t bboxMaxY_aligned = swr_alignUp(bboxMaxY, kBlockSize); | |
const int32_t bboxWidth = bboxMaxX_aligned - bboxMinX_aligned; | |
const int32_t bboxHeight = bboxMaxY_aligned - bboxMinY_aligned; | |
// Prepare interpolated attributes | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos; | |
const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos; | |
const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos; | |
const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos; | |
const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos; | |
const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos; | |
const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos; | |
const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos; | |
const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos; | |
const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos; | |
const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos; | |
const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos; | |
const int32_t cr02 = (int32_t)c0r - (int32_t)c2r; | |
const int32_t cg02 = (int32_t)c0g - (int32_t)c2g; | |
const int32_t cb02 = (int32_t)c0b - (int32_t)c2b; | |
const int32_t ca02 = (int32_t)c0a - (int32_t)c2a; | |
const int32_t cr12 = (int32_t)c1r - (int32_t)c2r; | |
const int32_t cg12 = (int32_t)c1g - (int32_t)c2g; | |
const int32_t cb12 = (int32_t)c1b - (int32_t)c2b; | |
const int32_t ca12 = (int32_t)c1a - (int32_t)c2a; | |
#endif | |
// Triangle setup | |
const swr_edge edge0 = swr_edgeInit(x2, y2, x1, y1); | |
const swr_edge edge1 = swr_edgeInit(x0, y0, x2, y2); | |
const swr_edge edge2 = swr_edgeInit(x1, y1, x0, y0); | |
// Barycentric coordinate normalization | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const float inv_area = 1.0f / (float)iarea; | |
#endif | |
// Trivial reject/accept corner offsets relative to block min/max. | |
const int32_t trivialRejectOffset0 = 0 | |
+ (edge0.m_dx >= 0 ? edge0.m_dx * (kBlockSize - 1) : 0) | |
+ (edge0.m_dy >= 0 ? edge0.m_dy * (kBlockSize - 1) : 0) | |
; | |
const int32_t trivialRejectOffset1 = 0 | |
+ (edge1.m_dx >= 0 ? edge1.m_dx * (kBlockSize - 1) : 0) | |
+ (edge1.m_dy >= 0 ? edge1.m_dy * (kBlockSize - 1) : 0) | |
; | |
const int32_t trivialRejectOffset2 = 0 | |
+ (edge2.m_dx >= 0 ? edge2.m_dx * (kBlockSize - 1) : 0) | |
+ (edge2.m_dy >= 0 ? edge2.m_dy * (kBlockSize - 1) : 0) | |
; | |
const int32_t trivialAcceptOffset0 = (edge0.m_dx + edge0.m_dy) * (kBlockSize - 1) - trivialRejectOffset0; | |
const int32_t trivialAcceptOffset1 = (edge1.m_dx + edge1.m_dy) * (kBlockSize - 1) - trivialRejectOffset1; | |
const int32_t trivialAcceptOffset2 = (edge2.m_dx + edge2.m_dy) * (kBlockSize - 1) - trivialRejectOffset2; | |
const int32_t trivialRejectOffset0_dx = 0 | |
+ (edge0.m_dx >= 0 ? edge0.m_dx * (kBlockSize - 1) : 0) | |
; | |
const int32_t trivialRejectOffset1_dx = 0 | |
+ (edge1.m_dx >= 0 ? edge1.m_dx * (kBlockSize - 1) : 0) | |
; | |
const int32_t trivialRejectOffset2_dx = 0 | |
+ (edge2.m_dx >= 0 ? edge2.m_dx * (kBlockSize - 1) : 0) | |
; | |
// Rasterize | |
const int32_t w0_bboxMin = swr_edgeEval(edge0, bboxMinX_aligned, bboxMinY_aligned); | |
const int32_t w1_bboxMin = swr_edgeEval(edge1, bboxMinX_aligned, bboxMinY_aligned); | |
const int32_t w2_bboxMin = swr_edgeEval(edge2, bboxMinX_aligned, bboxMinY_aligned); | |
const int32_t w0_block_dx = edge0.m_dx * kBlockSize; | |
const int32_t w0_block_dy = edge0.m_dy * kBlockSize; | |
const int32_t w1_block_dx = edge1.m_dx * kBlockSize; | |
const int32_t w1_block_dy = edge1.m_dy * kBlockSize; | |
const int32_t w2_block_dx = edge2.m_dx * kBlockSize; | |
const int32_t w2_block_dy = edge2.m_dy * kBlockSize; | |
int32_t w0_blockY = w0_bboxMin; | |
int32_t w1_blockY = w1_bboxMin; | |
int32_t w2_blockY = w2_bboxMin; | |
for (int32_t blockMinY = bboxMinY_aligned; blockMinY < bboxMaxY; blockMinY += kBlockSize) { | |
int32_t w0_blockMin = w0_blockY; | |
int32_t w1_blockMin = w1_blockY; | |
int32_t w2_blockMin = w2_blockY; | |
for (int32_t blockMinX = bboxMinX_aligned; blockMinX < bboxMaxX; blockMinX += kBlockSize) { | |
// Evaluate each edge function at its trivial reject corner (the most positive block corner). | |
// If the trivial rejct corner of any edge is negative (outside the edge) then the triangle | |
// does not touch the block. | |
const int32_t w0_trivialReject = w0_blockMin + trivialRejectOffset0; | |
const int32_t w1_trivialReject = w1_blockMin + trivialRejectOffset1; | |
const int32_t w2_trivialReject = w2_blockMin + trivialRejectOffset2; | |
if (SWR_ANY_NEGATIVE3(w0_trivialReject, w1_trivialReject, w2_trivialReject)) { | |
w0_blockMin += w0_block_dx; | |
w1_blockMin += w1_block_dx; | |
w2_blockMin += w2_block_dx; | |
continue; | |
} | |
// At this point we know that the triangle touches the tile. There are 2 cases: | |
// - The tile is fully covered by the triangle. | |
// - The tile is partially covered by the triangle. | |
// | |
// In the first case (fully covered tile) we can simply loop over all rows and fill them (fast path). | |
// In the second case (partially covered tile) we have to conditionally calculate the color of each pixel row. | |
// | |
// Evaluate each edge function at its trivial accept corner (the most negative block corner). | |
// The trivial accept corner is the opposite corner to the trivial reject corner. | |
// If all trivial accept corners are inside their respective edges then the block is fully | |
// covered by the triangle (1st case). Otherwise it's partially covered (2nd case). | |
// | |
// The trivial accept corner is calculated by subtracting the trivial reject corner offset from | |
// the block's max point. | |
// E.g. If the trivial reject corner ended up being (blockMinX, blockMaxY) it means that the offset | |
// was (0, kBlockSize - 1). Subtracting this offset from the block's max corner gives the opposite | |
// (trivial accept) corner: | |
// trivialAcceptCornerX = blockMaxX - 0 = blockMaxX | |
// trivialAcceptCornerY = blockMaxY - (kBlockSize - 1) = blockMinY + (kBlockSize - 1) - (kBlockSize - 1) = blockMinY | |
// | |
const int32_t w0_trivialAccept = w0_blockMin + trivialAcceptOffset0; | |
const int32_t w1_trivialAccept = w1_blockMin + trivialAcceptOffset1; | |
const int32_t w2_trivialAccept = w2_blockMin + trivialAcceptOffset2; | |
if (SWR_ANY_NEGATIVE3(w0_trivialAccept, w1_trivialAccept, w2_trivialAccept)) { | |
// Partial block | |
int32_t pymin = 0; | |
int32_t pymax = kBlockSize - 1; | |
if (w0_trivialAccept < 0) { | |
// Evaluate 1st edge function at the 4 block corners. If all of the signed | |
// distances are negative (all sign bits are 1) then the block will be empty. | |
const int32_t w0_A = w0_blockMin; | |
const int32_t w0_B = w0_blockMin + w0_block_dx; | |
const int32_t w0_C = w0_blockMin + w0_block_dx + w0_block_dy; | |
const int32_t w0_D = w0_blockMin + w0_block_dy; | |
const uint32_t w0_blockMsk = SWR_BLOCK_MASK(w0_A, w0_B, w0_C, w0_D); | |
assert(SWR_BLOCK_MASK_IS_VALID(w0_blockMsk)); | |
assert(w0_blockMsk != SWR_BLOCK_MASK_EMPTY); | |
if (SWR_BLOCK_MASK_Y_MAX(w0_blockMsk)) { | |
const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy)); | |
pymax = swr_mini(pymax, w_pymax); | |
} else if (SWR_BLOCK_MASK_Y_MIN(w0_blockMsk)) { | |
const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w0_A, w0_B)), edge0.m_dy)); | |
pymin = swr_maxi(pymin, w_pymin); | |
} | |
} | |
if (w1_trivialAccept < 0) { | |
// Evaluate 2nd edge function at the 4 block corners. If all of the signed | |
// distances are negative (all sign bits are 1) then the block will be empty. | |
const int32_t w1_A = w1_blockMin; | |
const int32_t w1_B = w1_blockMin + w1_block_dx; | |
const int32_t w1_C = w1_blockMin + w1_block_dx + w1_block_dy; | |
const int32_t w1_D = w1_blockMin + w1_block_dy; | |
const uint32_t w1_blockMsk = SWR_BLOCK_MASK(w1_A, w1_B, w1_C, w1_D); | |
assert(SWR_BLOCK_MASK_IS_VALID(w1_blockMsk)); | |
assert(w1_blockMsk != SWR_BLOCK_MASK_EMPTY); | |
if (SWR_BLOCK_MASK_Y_MAX(w1_blockMsk)) { | |
const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy)); | |
pymax = swr_mini(pymax, w_pymax); | |
} else if (SWR_BLOCK_MASK_Y_MIN(w1_blockMsk)) { | |
const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w1_A, w1_B)), edge1.m_dy)); | |
pymin = swr_maxi(pymin, w_pymin); | |
} | |
} | |
if (w2_trivialAccept < 0) { | |
// Evaluate 3rd edge function at the 4 block corners. If all of the signed | |
// distances are negative (all sign bits are 1) then the block will be empty. | |
const int32_t w2_A = w2_blockMin; | |
const int32_t w2_B = w2_blockMin + w2_block_dx; | |
const int32_t w2_C = w2_blockMin + w2_block_dx + w2_block_dy; | |
const int32_t w2_D = w2_blockMin + w2_block_dy; | |
const uint32_t w2_blockMsk = SWR_BLOCK_MASK(w2_A, w2_B, w2_C, w2_D); | |
assert(SWR_BLOCK_MASK_IS_VALID(w2_blockMsk)); | |
assert(w2_blockMsk != SWR_BLOCK_MASK_EMPTY); | |
if (SWR_BLOCK_MASK_Y_MAX(w2_blockMsk)) { | |
const int32_t w_pymax = swr_absi(swr_idiv_floor(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy)); | |
pymax = swr_mini(pymax, w_pymax); | |
} else if (SWR_BLOCK_MASK_Y_MIN(w2_blockMsk)) { | |
const int32_t w_pymin = swr_absi(swr_idiv_ceil(swr_absi(swr_maxi(w2_A, w2_B)), edge2.m_dy)); | |
pymin = swr_maxi(pymin, w_pymin); | |
} | |
} | |
// Evaluate edge functions at the first row. | |
int32_t w0_blockMinX_py = w0_blockMin + edge0.m_dy * pymin; | |
int32_t w1_blockMinX_py = w1_blockMin + edge1.m_dy * pymin; | |
int32_t w2_blockMinX_py = w2_blockMin + edge2.m_dy * pymin; | |
for (int32_t py = pymin; py <= pymax; ++py) { | |
const int32_t w0_rowMsk = SWR_ROW_MASK(w0_blockMinX_py, w0_blockMinX_py + w0_block_dx); | |
const int32_t w1_rowMsk = SWR_ROW_MASK(w1_blockMinX_py, w1_blockMinX_py + w1_block_dx); | |
const int32_t w2_rowMsk = SWR_ROW_MASK(w2_blockMinX_py, w2_blockMinX_py + w2_block_dx); | |
assert(w0_rowMsk != SWR_ROW_MASK_EMPTY); | |
assert(w1_rowMsk != SWR_ROW_MASK_EMPTY); | |
assert(w2_rowMsk != SWR_ROW_MASK_EMPTY); | |
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + (blockMinY + py) * ctx->m_Width]; | |
int32_t pxmin = 0; | |
int32_t pxmax = (int32_t)kBlockSize - 1; | |
if (!SWR_ROW_MASK_ALL_FULL(w0_rowMsk, w1_rowMsk, w2_rowMsk)) { | |
if (SWR_ROW_MASK_X_MAX(w0_rowMsk)) { | |
const int32_t w_pxmax = swr_idiv_floor(w0_blockMinX_py, -edge0.m_dx); | |
pxmax = swr_mini(pxmax, w_pxmax); | |
} else if (SWR_ROW_MASK_X_MIN(w0_rowMsk)) { | |
const int32_t w_pxmin = swr_idiv_ceil(-w0_blockMinX_py, edge0.m_dx); | |
pxmin = swr_maxi(pxmin, w_pxmin); | |
} | |
if (SWR_ROW_MASK_X_MAX(w1_rowMsk)) { | |
const int32_t w_pxmax = swr_idiv_floor(w1_blockMinX_py, -edge1.m_dx); | |
pxmax = swr_mini(pxmax, w_pxmax); | |
} else if (SWR_ROW_MASK_X_MIN(w1_rowMsk)) { | |
const int32_t w_pxmin = swr_idiv_ceil(-w1_blockMinX_py, edge1.m_dx); | |
pxmin = swr_maxi(pxmin, w_pxmin); | |
} | |
if (SWR_ROW_MASK_X_MAX(w2_rowMsk)) { | |
const int32_t w_pxmax = swr_idiv_floor(w2_blockMinX_py, -edge2.m_dx); | |
pxmax = swr_mini(pxmax, w_pxmax); | |
} else if (SWR_ROW_MASK_X_MIN(w2_rowMsk)) { | |
const int32_t w_pxmin = swr_idiv_ceil(-w2_blockMinX_py, edge2.m_dx); | |
pxmin = swr_maxi(pxmin, w_pxmin); | |
} | |
} | |
// Calculate barycentric coords at pxmin | |
int32_t w0 = w0_blockMinX_py + pxmin * edge0.m_dx; | |
int32_t w1 = w1_blockMinX_py + pxmin * edge1.m_dx; | |
int32_t w2 = w2_blockMinX_py + pxmin * edge2.m_dx; | |
for (int32_t px = pxmin; px <= pxmax; ++px) { | |
// (px, py) is guaranteed to be inside the triangle (or on one of the edges) | |
// Render the pixel | |
{ | |
assert(w0 >= 0 && w1 >= 0 && w2 >= 0); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const uint32_t rgba = 0xFFFFFFFF; | |
#else | |
const float l0 = (float)w0 * inv_area; | |
const float l1 = (float)w1 * inv_area; | |
// l2 = 1.0f - (l0 + l1) | |
// | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=> | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=> | |
// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=> | |
// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=> | |
// | |
// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2)); | |
const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r); | |
const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g); | |
const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b); | |
const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a); | |
const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca); | |
#endif | |
fb_row[px] = rgba; | |
} | |
w0 += edge0.m_dx; | |
w1 += edge1.m_dx; | |
w2 += edge2.m_dx; | |
} | |
w0_blockMinX_py += edge0.m_dy; | |
w1_blockMinX_py += edge1.m_dy; | |
w2_blockMinX_py += edge2.m_dy; | |
} | |
} else { | |
// Full block | |
int32_t w0_row = w0_blockMin; | |
int32_t w1_row = w1_blockMin; | |
int32_t w2_row = w2_blockMin; | |
uint32_t* fb_row = &ctx->m_FrameBuffer[blockMinX + blockMinY * ctx->m_Width]; | |
for (int32_t py = 0; py < (int32_t)kBlockSize; ++py) { | |
// Calculate barycentric coords at pxmin | |
int32_t w0 = w0_row; | |
int32_t w1 = w1_row; | |
int32_t w2 = w2_row; | |
for (int32_t px = 0; px < (int32_t)kBlockSize; ++px) { | |
// (px, py) is guaranteed to be inside the triangle (or on one of the edges) | |
// Render the pixel | |
{ | |
assert(w0 >= 0 && w1 >= 0 && w2 >= 0); | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const uint32_t rgba = 0xFFFFFFFF; | |
#else | |
const float l0 = (float)w0 * inv_area; | |
const float l1 = (float)w1 * inv_area; | |
// l2 = 1.0f - (l0 + l1) | |
// | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=> | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=> | |
// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=> | |
// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=> | |
// | |
// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2)); | |
const uint32_t cr = (uint32_t)(cr02 * l0 + cr12 * l1 + c2r); | |
const uint32_t cg = (uint32_t)(cg02 * l0 + cg12 * l1 + c2g); | |
const uint32_t cb = (uint32_t)(cb02 * l0 + cb12 * l1 + c2b); | |
const uint32_t ca = (uint32_t)(ca02 * l0 + ca12 * l1 + c2a); | |
const uint32_t rgba = SWR_COLOR(cr, cg, cb, ca); | |
#endif | |
fb_row[px] = rgba; | |
} | |
w0 += edge0.m_dx; | |
w1 += edge1.m_dx; | |
w2 += edge2.m_dx; | |
} | |
w0_row += edge0.m_dy; | |
w1_row += edge1.m_dy; | |
w2_row += edge2.m_dy; | |
fb_row += ctx->m_Width; | |
} | |
} | |
w0_blockMin += w0_block_dx; | |
w1_blockMin += w1_block_dx; | |
w2_blockMin += w2_block_dx; | |
} | |
w0_blockY += w0_block_dy; | |
w1_blockY += w1_block_dy; | |
w2_blockY += w2_block_dy; | |
} | |
} | |
#endif | |
// Initial SSE2 implementation based on swrDrawTriangle_Ref() | |
#if 1 | |
static void swrDrawTriangleSSE2_Ref(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2) | |
{ | |
// Make sure the triangle is CCW. If it's not swap points 1 and 2 to make it CCW. | |
int32_t iarea = (x0 - x2) * (y1 - y0) - (x1 - x0) * (y0 - y2); | |
if (iarea == 0) { | |
// Degenerate triangle with 0 area. | |
return; | |
} else if (iarea < 0) { | |
// Swap (x1, y1) <-> (x2, y2) | |
{ int32_t tmp = x1; x1 = x2; x2 = tmp; } | |
{ int32_t tmp = y1; y1 = y2; y2 = tmp; } | |
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; } | |
iarea = -iarea; | |
} | |
// Compute triangle bounding box | |
const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0); | |
const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0); | |
const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)(ctx->m_Width - 1)); | |
const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)(ctx->m_Height - 1)); | |
const int32_t bboxWidth = maxX - minX; | |
const int32_t bboxHeight = maxY - minY; | |
// Prepare interpolated attributes | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4f v_c0 = vec4f_fromRGBA8(color0); | |
const vec4f v_c1 = vec4f_fromRGBA8(color1); | |
const vec4f v_c2 = vec4f_fromRGBA8(color2); | |
const vec4f v_c02 = vec4f_sub(v_c0, v_c2); | |
const vec4f v_c12 = vec4f_sub(v_c1, v_c2); | |
#endif | |
// Triangle setup | |
const vec4i v_w_px = vec4i_fromInt4(y1 - y2, y2 - y0, y0 - y1, 0); | |
const vec4i v_w_py = vec4i_fromInt4(x2 - x1, x0 - x2, x1 - x0, 0); | |
const vec4i v_w_c = vec4i_fromInt4(x1 * y2 - y1 * x2, x2 * y0 - y2 * x0, x0 * y1 - y0 * x1, 0); | |
const vec4i v_minX = vec4i_fromInt(minX); | |
const vec4i v_minY = vec4i_fromInt(minY); | |
const vec4i v_w_pmin = vec4i_add(v_w_c, vec4i_add(vec4i_mullo_SSE2(v_w_px, v_minX), vec4i_mullo_SSE2(v_w_py, v_minY))); | |
// Barycentric coordinate normalization | |
#if !SWR_CONFIG_NO_PIXEL_SHADER | |
const vec4f v_inv_area = vec4f_fromFloat(1.0f / (float)iarea); | |
#endif | |
// Rasterize | |
vec4i v_w_row = v_w_pmin; | |
uint32_t* fb_row = &ctx->m_FrameBuffer[minX + minY * ctx->m_Width]; | |
for (int32_t py = 0; py <= bboxHeight; ++py) { | |
int32_t pxmin = 0; | |
int32_t pxmax = bboxWidth; | |
// Calculate the range of x values for which the barycentric coordinates | |
// will always be greater than or equal to 0. | |
{ | |
int32_t w_row[4]; | |
vec4i_toInt4vu(v_w_row, &w_row[0]); | |
int32_t w_px[4]; | |
vec4i_toInt4vu(v_w_px, &w_px[0]); | |
// The barycentric coordinates are linear functions: w_pmin + i * w_px | |
// | |
// The inequality w_pmin + i * w_px >= 0 holds for all i's in the range: | |
// 1. w_pmin >= 0 && w_px >= 0 : [0, bboxWidth] | |
// 2. w_pmin >= 0 && w_px < 0 : [0, imax] where imax = -(w_pmin / w_px) | |
// 3. w_pmin < 0 && w_px > 0 : [imin, bboxWidth] where imin = -(w_pmin / w_px) + 1 | |
// 4. w_pmin < 0 && w_px <= 0 : never | |
// | |
// From the 3 barycentric coordinates we have 3 equations. All of them | |
// should be greater than or equal to 0 to draw a pixel. | |
// Make sure we aren't in an invalid state. | |
assert(!(w_row[0] < 0 && w_px[0] <= 0)); | |
assert(!(w_row[1] < 0 && w_px[1] <= 0)); | |
assert(!(w_row[2] < 0 && w_px[2] <= 0)); | |
// Calculate x range based on w0... | |
if (w_row[0] >= 0 && w_px[0] < 0) { | |
pxmax = swr_mini(pxmax, -(w_row[0] / w_px[0])); | |
} else if (w_row[0] < 0 && w_px[0] > 0) { | |
pxmin = swr_maxi(pxmin, -(w_row[0] / w_px[0]) + 1); | |
} | |
// Calculate x range based on w1... | |
if (w_row[1] >= 0 && w_px[1] < 0) { | |
pxmax = swr_mini(pxmax, -(w_row[1] / w_px[1])); | |
} else if (w_row[1] < 0 && w_px[1] > 0) { | |
pxmin = swr_maxi(pxmin, -(w_row[1] / w_px[1]) + 1); | |
} | |
// Calculate x range based on w2... | |
if (w_row[2] >= 0 && w_px[2] < 0) { | |
pxmax = swr_mini(pxmax, -(w_row[2] / w_px[2])); | |
} else if (w_row[2] < 0 && w_px[2] > 0) { | |
pxmin = swr_maxi(pxmin, -(w_row[2] / w_px[2]) + 1); | |
} | |
} | |
// Calculate barycentric coords at pxmin | |
const vec4i v_pxmin = vec4i_fromInt(pxmin); | |
vec4i v_w = vec4i_add(v_w_row, vec4i_mullo_SSE2(v_w_px, v_pxmin)); | |
for (int32_t px = pxmin; px <= pxmax; ++px) { | |
// (px, py) is guaranteed to be inside the triangle (or on one of the edges) | |
// Render the pixel | |
{ | |
#if SWR_CONFIG_NO_PIXEL_SHADER | |
const uint32_t rgba = 0xFFFFFFFF; | |
#else | |
int32_t w[4]; | |
vec4i_toInt4vu(v_w, &w[0]); | |
assert(w[0] >= 0 && w[1] >= 0 && w[2] >= 0); | |
const vec4f v_l = vec4f_mul(vec4f_fromVec4i(v_w), v_inv_area); | |
const vec4f v_l0 = vec4f_getXXXX(v_l); | |
const vec4f v_l1 = vec4f_getYYYY(v_l); | |
// l2 = 1.0f - (l0 + l1) | |
// | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * l2 <=> | |
// attr = attr0 * l0 + attr1 * l1 + attr2 * (1.0 - l0 - l1) <=> | |
// attr = (attr0 - attr2) * l0 + (attr1 - attr2) * l1 + attr2 <=> | |
// attr = dattr02 * l0 + dattr12 * l1 + attr2 <=> | |
// | |
// attr = fmad(dattr02, l0, fmad(dattr12, l1, attr2)); | |
const vec4f v_c = vec4f_madd_SSE2(v_c02, v_l0, vec4f_madd_SSE2(v_c12, v_l1, v_c2)); | |
const uint32_t rgba = vec4f_toRGBA8(v_c); | |
#endif | |
fb_row[px] = rgba; | |
} | |
v_w = vec4i_add(v_w, v_w_px); | |
} | |
v_w_row = vec4i_add(v_w_row, v_w_py); | |
fb_row += ctx->m_Width; | |
} | |
} | |
#endif | |
#if 0 | |
// Old implementations (see Triangle Rasterizations posts) | |
////////////////////////////////////////////////////////////////////////// | |
// SSE2 implementation | |
// | |
#define USE_VEC4_LIB 0 | |
// http://dss.stephanierct.com/DevBlog/?p=8 | |
// Four lanes of 1.0f.
static const float xmm_ones[] = { 1.0f, 1.0f, 1.0f, 1.0f };

// SSE2-only replacement for the SSE4.1 floor instruction: rounds every lane of x
// towards negative infinity.
//
// The value is truncated towards zero with _mm_cvttps_epi32() and 1.0f is subtracted
// from the lanes where the truncated value ended up greater than the input
// (i.e. negative inputs with a fractional part).
//
// Only meaningful for lanes whose truncated value fits in a signed 32-bit integer;
// _mm_cvttps_epi32() does not produce a usable value for anything else.
static inline __m128 _mm_floor_ps_SSE2(__m128 x)
{
	// The constant is materialized with _mm_set1_ps() instead of an aligned load
	// (_mm_load_ps) from a float array that carries no 16-byte alignment guarantee.
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
	// All-ones mask in the lanes where truncation rounded up.
	const __m128 roundedUp = _mm_cmpgt_ps(truncated, x);
	return _mm_sub_ps(truncated, _mm_and_ps(roundedUp, one));
}
// SSE2-only replacement for the SSE4.1 ceil instruction: rounds every lane of x
// towards positive infinity.
//
// The value is truncated towards zero with _mm_cvttps_epi32() and 1.0f is added
// to the lanes where the truncated value ended up less than the input
// (i.e. positive inputs with a fractional part).
//
// Only meaningful for lanes whose truncated value fits in a signed 32-bit integer;
// _mm_cvttps_epi32() does not produce a usable value for anything else.
static inline __m128 _mm_ceil_ps_SSE2(__m128 x)
{
	// The constant is materialized with _mm_set1_ps() instead of an aligned load
	// (_mm_load_ps) from a float array that carries no 16-byte alignment guarantee.
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
	// All-ones mask in the lanes where truncation rounded down.
	const __m128 roundedDown = _mm_cmplt_ps(truncated, x);
	return _mm_add_ps(truncated, _mm_and_ps(roundedDown, one));
}
// SSE2-only replacement for the SSE4.1 _mm_mullo_epi32(): multiplies the four 32-bit
// lanes of a and b and keeps the low 32 bits of every product.
static inline __m128i _mm_mullo_epi32_SSE2(__m128i a, __m128i b)
{
	// _mm_mul_epu32() only multiplies lanes 0 and 2, so the odd lanes are first
	// moved into the even positions.
	const __m128i aOdd = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1));
	const __m128i bOdd = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1));

	// 64-bit products: { a0 * b0, a2 * b2 } and { a1 * b1, a3 * b3 }
	const __m128i prodEven = _mm_mul_epu32(a, b);
	const __m128i prodOdd = _mm_mul_epu32(aOdd, bOdd);

	// Gather the low 32 bits of both products into lanes 0 and 1...
	const __m128i lowEven = _mm_shuffle_epi32(prodEven, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i lowOdd = _mm_shuffle_epi32(prodOdd, _MM_SHUFFLE(0, 0, 2, 0));

	// ...and interleave them back into the original lane order.
	return _mm_unpacklo_epi32(lowEven, lowOdd);
}
// Integer flavour of _mm_shuffle_ps(): the two lowest 32-bit lanes of the result are
// selected from a and the two highest from b, as described by imm8 (see _MM_SHUFFLE).
// The operands are only bit-cast to float vectors and back; no value conversion happens.
#define _mm_shuffle_si128(a, b, imm8) _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((a)), _mm_castsi128_ps((b)), (imm8)))
// Packs four pixels worth of floating point colour planes into interleaved 8-bit RGBA
// using SSE2 instructions only.
//
// r, g, b and a each hold one channel for pixels 0..3. Every lane is converted with
// _mm_cvtps_epi32() and saturated to [0, 255] by the pack instructions.
//
// Returns (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
static inline __m128i swrDrawTriangleSSE2_packRGBA8(__m128 r, __m128 g, __m128 b, __m128 a)
{
	// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
	const __m128i planar = _mm_packus_epi16(
		_mm_packs_epi32(_mm_cvtps_epi32(r), _mm_cvtps_epi32(g)),
		_mm_packs_epi32(_mm_cvtps_epi32(b), _mm_cvtps_epi32(a))
	);

	// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
	// _mm_shuffle_epi8() with SSE2: split every 16-bit word into its low and high byte
	// and pack the two halves one after the other. Doing it twice transposes the 4x4 bytes.
	const __m128i lowByteMask = _mm_set1_epi16(0x00FF);

	// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
	const __m128i halfway = _mm_packus_epi16(
		_mm_and_si128(planar, lowByteMask),
		_mm_srli_epi16(planar, 8)
	);

	// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
	return _mm_packus_epi16(
		_mm_and_si128(halfway, lowByteMask),
		_mm_srli_epi16(halfway, 8)
	);
}

// Rasterizes a triangle with per-vertex colours into ctx->m_FrameBuffer, 4 pixels at a time (SSE2).
//
// (x0, y0), (x1, y1), (x2, y2) are the vertex positions in pixels and color0..2 the matching
// packed 8-bit-per-channel colours. Triangles with zero area are skipped; triangles with a
// negative area get vertices 1 and 2 swapped so the rest of the code only deals with a
// positive area.
//
// For every row of the bounding box (clamped to the framebuffer) the range [ixmin, ixmax]
// of pixels with non-negative barycentric coordinates is computed up front, so the inner
// loop writes pixels without any per-pixel inside test. The colour of a pixel is
// color0 + (color2 - color0) * u + (color1 - color0) * v.
static void swrDrawTriangleSSE2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Bounding box of the triangle clamped to the framebuffer.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
	if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
		// The clamped bounding box is empty: nothing can be drawn. Bail out before a
		// framebuffer address is formed from coordinates that lie outside the framebuffer.
		return;
	}
	const int32_t bboxWidth = bboxMaxX - bboxMinX;
	const int32_t bboxHeight = bboxMaxY - bboxMinY;

	const int32_t dy01 = y0 - y1;
	const int32_t dx01 = x0 - x1;
	const int32_t dx20 = x2 - x0;
	const int32_t dy20 = y2 - y0;
	const int32_t dy01_dy20 = dy01 + dy20;

	// Calculate unnormalized barycentric coordinates of the bounding box min.
	const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
	const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
	const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;

#if USE_VEC4_LIB
	// Vertex colours as floats, the deltas relative to vertex 0 and one broadcast per channel.
	const vec4f v4f_rgba0 = vec4f_fromRGBA8(color0);
	const vec4f v4f_rgba1 = vec4f_fromRGBA8(color1);
	const vec4f v4f_rgba2 = vec4f_fromRGBA8(color2);
	const vec4f v4f_drgba20 = vec4f_sub(v4f_rgba2, v4f_rgba0);
	const vec4f v4f_drgba10 = vec4f_sub(v4f_rgba1, v4f_rgba0);
	const vec4f v4f_r0 = vec4f_getXXXX(v4f_rgba0);
	const vec4f v4f_g0 = vec4f_getYYYY(v4f_rgba0);
	const vec4f v4f_b0 = vec4f_getZZZZ(v4f_rgba0);
	const vec4f v4f_a0 = vec4f_getWWWW(v4f_rgba0);
	const vec4f v4f_dr20 = vec4f_getXXXX(v4f_drgba20);
	const vec4f v4f_dg20 = vec4f_getYYYY(v4f_drgba20);
	const vec4f v4f_db20 = vec4f_getZZZZ(v4f_drgba20);
	const vec4f v4f_da20 = vec4f_getWWWW(v4f_drgba20);
	const vec4f v4f_dr10 = vec4f_getXXXX(v4f_drgba10);
	const vec4f v4f_dg10 = vec4f_getYYYY(v4f_drgba10);
	const vec4f v4f_db10 = vec4f_getZZZZ(v4f_drgba10);
	const vec4f v4f_da10 = vec4f_getWWWW(v4f_drgba10);

	const vec4f v4f_inv_area = vec4f_fromFloat(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const vec4i v4i_x_duvw_ = vec4i_fromInt4(-dy01, -dy20, dy01_dy20, 0);
	const vec4f v4f_x_duvw_1 = vec4f_mul(vec4f_fromVec4i(v4i_x_duvw_), v4f_inv_area);
	const vec4f v4f_x_duvw_2 = vec4f_add(v4f_x_duvw_1, v4f_x_duvw_1);
	const vec4f v4f_x_duvw_3 = vec4f_add(v4f_x_duvw_1, v4f_x_duvw_2);
	const vec4f v4f_x_duvw_4 = vec4f_add(v4f_x_duvw_2, v4f_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const vec4f v4f_x_duv0_duv1 = vec4f_shuffle(vec4f_zero(), v4f_x_duvw_1, VEC4F_SHUFFLE_XYXY);
	// UV deltas for the 3rd and 4th pixel
	const vec4f v4f_x_duv2_duv3 = vec4f_shuffle(v4f_x_duvw_2, v4f_x_duvw_3, VEC4F_SHUFFLE_XYXY);
	// UV deltas for a step of 4 pixels
	const vec4f v4f_x_du4 = vec4f_getXXXX(v4f_x_duvw_4);
	const vec4f v4f_x_dv4 = vec4f_getYYYY(v4f_x_duvw_4);

	// Barycentric coordinate deltas for the Y direction
	const vec4i v4i_y_duvw_ = vec4i_fromInt4(dx01, dx20, -(dx01 + dx20), 0);

	// Unnormalized barycentric coordinates at the start of the current row.
	vec4i v4i_row_uvw_ = vec4i_fromInt4(bboxMin_u, bboxMin_v, bboxMin_w, 0);

	// Turns the row's barycentric coordinates into the (fractional) pixel offset at which
	// each of them reaches 0. A delta of 0 produces a non-finite scale.
	const vec4f v4f_row_uvw_scale = vec4f_fromFloat4(1.0f / (float)dy01, 1.0f / (float)dy20, 1.0f / (float)dy01_dy20, 0.0f);
#else
	// Vertex colours as floats, the deltas relative to vertex 0 and one broadcast per channel.
	const __m128i imm_zero = _mm_setzero_si128();
	const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
	const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
	const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
	const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
	const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);
	const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

	const __m128 xmm_zero = _mm_setzero_ps();
	const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
	const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
	const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
	const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
	const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
	// UV deltas for the 3rd and 4th pixel
	const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
	// UV deltas for a step of 4 pixels
	const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

	// Barycentric coordinate deltas for the Y direction
	const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

	// Unnormalized barycentric coordinates at the start of the current row.
	__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

	// Turns the row's barycentric coordinates into the (fractional) pixel offset at which
	// each of them reaches 0. A delta of 0 produces a non-finite scale.
	const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);
#endif

	uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
	for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
		int32_t ixmin = 0;
		int32_t ixmax = bboxWidth;

		// Calculate ixmin and ixmax
		{
			int32_t row_uvw_[4];
			int32_t row_uvw_floor[4];
			int32_t row_uvw_ceil[4];
#if USE_VEC4_LIB
			vec4i_toInt4vu(v4i_row_uvw_, &row_uvw_[0]);
			const vec4f v4f_row_uvw_ = vec4f_mul(vec4f_fromVec4i(v4i_row_uvw_), v4f_row_uvw_scale);
			const vec4i v4i_row_uvw_floor = vec4i_fromVec4f(vec4f_floor_SSE2(v4f_row_uvw_));
			const vec4i v4i_row_uvw_ceil = vec4i_fromVec4f(vec4f_ceil_SSE2(v4f_row_uvw_));
			vec4i_toInt4vu(v4i_row_uvw_floor, &row_uvw_floor[0]);
			vec4i_toInt4vu(v4i_row_uvw_ceil, &row_uvw_ceil[0]);
#else
			_mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);
			const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
			const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
			const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));
			_mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
			_mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);
#endif
			// NOTE(review): when one of the deltas is 0 the scaled values above are not
			// finite. The branches below look like they rely on the converted result being
			// harmless in swr_mini()/swr_maxi() - confirm before touching them.

			// u(ix) = row_u - ix * dy01 >= 0
			if (dy01 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[0]);
			} else if (row_uvw_[0] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
			}

			// v(ix) = row_v - ix * dy20 >= 0
			if (dy20 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[1]);
			} else if (row_uvw_[1] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
			}

			// w(ix) = row_w + ix * (dy01 + dy20) >= 0
			if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
				ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
			} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
				ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
			}
		}

		if (ixmin <= ixmax) {
#if USE_VEC4_LIB
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const vec4i v4i_p0uvw_ = vec4i_add(v4i_row_uvw_, vec4i_mullo_SSE2(vec4i_fromInt(ixmin), v4i_x_duvw_));
			const vec4f v4f_p0uvw_ = vec4f_mul(vec4f_fromVec4i(v4i_p0uvw_), v4f_inv_area);
			const vec4f v4f_p0uvuv = vec4f_getXYXY(v4f_p0uvw_);

			// Calculate barycentric coordinates for the 4 pixels.
			const vec4f v4f_p0uv_p1uv = vec4f_add(v4f_p0uvuv, v4f_x_duv0_duv1);
			const vec4f v4f_p2uv_p3uv = vec4f_add(v4f_p0uvuv, v4f_x_duv2_duv3);

			// Extract barycentric coordinates for each pixel
			vec4f v4f_u0123 = vec4f_shuffle(v4f_p0uv_p1uv, v4f_p2uv_p3uv, VEC4F_SHUFFLE_XZXZ);
			vec4f v4f_v0123 = vec4f_shuffle(v4f_p0uv_p1uv, v4f_p2uv_p3uv, VEC4F_SHUFFLE_YWYW);
#else
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
			const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
			const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

			// Calculate barycentric coordinates for the 4 pixels.
			const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
			const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

			// Extract barycentric coordinates for each pixel
			__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
			__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));
#endif

			uint32_t* frameBuffer = &framebufferRow[ixmin];
			const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
			const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
			for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
				// Calculate the color of each pixel
#if USE_VEC4_LIB
				const vec4f v4f_r_p0123 = vec4f_add(v4f_r0, vec4f_add(vec4f_mul(v4f_dr10, v4f_v0123), vec4f_mul(v4f_dr20, v4f_u0123)));
				const vec4f v4f_g_p0123 = vec4f_add(v4f_g0, vec4f_add(vec4f_mul(v4f_dg10, v4f_v0123), vec4f_mul(v4f_dg20, v4f_u0123)));
				const vec4f v4f_b_p0123 = vec4f_add(v4f_b0, vec4f_add(vec4f_mul(v4f_db10, v4f_v0123), vec4f_mul(v4f_db20, v4f_u0123)));
				const vec4f v4f_a_p0123 = vec4f_add(v4f_a0, vec4f_add(vec4f_mul(v4f_da10, v4f_v0123), vec4f_mul(v4f_da20, v4f_u0123)));
				const __m128i imm_rgba_p0123_u8 = swrDrawTriangleSSE2_packRGBA8(v4f_r_p0123.m_XMM, v4f_g_p0123.m_XMM, v4f_b_p0123.m_XMM, v4f_a_p0123.m_XMM);
#else
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
				const __m128i imm_rgba_p0123_u8 = swrDrawTriangleSSE2_packRGBA8(xmm_r_p0123, xmm_g_p0123, xmm_b_p0123, xmm_a_p0123);
#endif

				// Store
				_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

				// Move on to the next set of pixels
#if USE_VEC4_LIB
				v4f_u0123 = vec4f_add(v4f_u0123, v4f_x_du4);
				v4f_v0123 = vec4f_add(v4f_v0123, v4f_x_dv4);
#else
				xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
				xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
#endif
				frameBuffer += 4;
			}

			// Calculate the colors of the 4 next pixels and selectively store only the number
			// of remainder pixels for this row
			const uint32_t rem = numPixels & 3;
			if (rem != 0) {
#if USE_VEC4_LIB
				const vec4f v4f_r_p0123 = vec4f_madd_SSE2(v4f_dr10, v4f_v0123, vec4f_madd_SSE2(v4f_dr20, v4f_u0123, v4f_r0));
				const vec4f v4f_g_p0123 = vec4f_madd_SSE2(v4f_dg10, v4f_v0123, vec4f_madd_SSE2(v4f_dg20, v4f_u0123, v4f_g0));
				const vec4f v4f_b_p0123 = vec4f_madd_SSE2(v4f_db10, v4f_v0123, vec4f_madd_SSE2(v4f_db20, v4f_u0123, v4f_b0));
				const vec4f v4f_a_p0123 = vec4f_madd_SSE2(v4f_da10, v4f_v0123, vec4f_madd_SSE2(v4f_da20, v4f_u0123, v4f_a0));
				const __m128i imm_rgba_p0123_u8 = swrDrawTriangleSSE2_packRGBA8(v4f_r_p0123.m_XMM, v4f_g_p0123.m_XMM, v4f_b_p0123.m_XMM, v4f_a_p0123.m_XMM);
#else
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
				const __m128i imm_rgba_p0123_u8 = swrDrawTriangleSSE2_packRGBA8(xmm_r_p0123, xmm_g_p0123, xmm_b_p0123, xmm_a_p0123);
#endif

				switch (rem) {
				case 1:
					_mm_storeu_si32(frameBuffer, imm_rgba_p0123_u8);
					break;
				case 2:
					_mm_storeu_si64(frameBuffer, imm_rgba_p0123_u8);
					break;
				case 3:
					// Pixels 0 and 1 first, then pixel 2 moved into the lowest lane.
					_mm_storeu_si64(frameBuffer, imm_rgba_p0123_u8);
					_mm_storeu_si32(&frameBuffer[2], _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(2, 2, 2, 2)));
					break;
				default:
					break;
				}
			}
		}

		// Move on to the next row of pixels.
#if USE_VEC4_LIB
		v4i_row_uvw_ = vec4i_add(v4i_row_uvw_, v4i_y_duvw_);
#else
		imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
#endif
		framebufferRow += ctx->m_Width;
	}
}
// Packs four pixels worth of floating point colour planes into interleaved 8-bit RGBA.
//
// r, g, b and a each hold one channel for pixels 0..3. Every lane is converted with
// _mm_cvtps_epi32() and saturated to [0, 255] by the pack instructions; the bytes are
// then reordered with a single SSSE3 _mm_shuffle_epi8().
//
// Returns (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
static inline __m128i swrDrawTriangleSSSE3_packRGBA8(__m128 r, __m128 g, __m128 b, __m128 a)
{
	// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
	const __m128i planar = _mm_packus_epi16(
		_mm_packs_epi32(_mm_cvtps_epi32(r), _mm_cvtps_epi32(g)),
		_mm_packs_epi32(_mm_cvtps_epi32(b), _mm_cvtps_epi32(a))
	);

	// Shuffle into RGBA uint32_t: output byte i is read from planar byte (i % 4) * 4 + (i / 4).
	const __m128i planarToRGBA = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
	return _mm_shuffle_epi8(planar, planarToRGBA);
}

// Rasterizes a triangle with per-vertex colours into ctx->m_FrameBuffer, 4 pixels at a time.
// Same algorithm as the SSE2 variant, except that the final byte reordering uses the SSSE3
// _mm_shuffle_epi8() instruction.
//
// (x0, y0), (x1, y1), (x2, y2) are the vertex positions in pixels and color0..2 the matching
// packed 8-bit-per-channel colours. Triangles with zero area are skipped; triangles with a
// negative area get vertices 1 and 2 swapped so the rest of the code only deals with a
// positive area.
//
// For every row of the bounding box (clamped to the framebuffer) the range [ixmin, ixmax]
// of pixels with non-negative barycentric coordinates is computed up front, so the inner
// loop writes pixels without any per-pixel inside test. The colour of a pixel is
// color0 + (color2 - color0) * u + (color1 - color0) * v.
static void swrDrawTriangleSSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	} else if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2)
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Bounding box of the triangle clamped to the framebuffer.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
	if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
		// The clamped bounding box is empty: nothing can be drawn. Bail out before a
		// framebuffer address is formed from coordinates that lie outside the framebuffer.
		return;
	}
	const int32_t bboxWidth = bboxMaxX - bboxMinX;
	const int32_t bboxHeight = bboxMaxY - bboxMinY;

	// Vertex colours as floats, the deltas relative to vertex 0 and one broadcast per channel.
	const __m128i imm_zero = _mm_setzero_si128();
	const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
	const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
	const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
	const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
	const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);
	const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

	const int32_t dy01 = y0 - y1;
	const int32_t dx01 = x0 - x1;
	const int32_t dx20 = x2 - x0;
	const int32_t dy20 = y2 - y0;
	const int32_t dy01_dy20 = dy01 + dy20;

	const __m128 xmm_zero = _mm_setzero_ps();
	const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
	const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
	const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
	const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
	const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
	// UV deltas for the 3rd and 4th pixel
	const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
	// UV deltas for a step of 4 pixels
	const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

	// Barycentric coordinate deltas for the Y direction
	const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

	// Calculate unnormalized barycentric coordinates of the bounding box min.
	const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
	const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
	const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
	__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

	// Turns the row's barycentric coordinates into the (fractional) pixel offset at which
	// each of them reaches 0. A delta of 0 produces a non-finite scale.
	const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

	uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
	for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
		int32_t ixmin = 0;
		int32_t ixmax = bboxWidth;

		// Calculate ixmin and ixmax
		{
			int32_t row_uvw_[4];
			_mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

			const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
			const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
			const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

			int32_t row_uvw_floor[4];
			_mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
			int32_t row_uvw_ceil[4];
			_mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

			// NOTE(review): when one of the deltas is 0 the scaled values above are not
			// finite. The branches below look like they rely on the converted result being
			// harmless in swr_mini()/swr_maxi() - confirm before touching them.

			// u(ix) = row_u - ix * dy01 >= 0
			if (dy01 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[0]);
			} else if (row_uvw_[0] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
			}

			// v(ix) = row_v - ix * dy20 >= 0
			if (dy20 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[1]);
			} else if (row_uvw_[1] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
			}

			// w(ix) = row_w + ix * (dy01 + dy20) >= 0
			if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
				ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
			} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
				ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
			}
		}

		if (ixmin <= ixmax) {
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
			const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
			const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

			// Calculate barycentric coordinates for the 4 pixels.
			const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
			const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

			// Extract barycentric coordinates for each pixel
			__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
			__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

			uint32_t* frameBuffer = &framebufferRow[ixmin];
			const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
			const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
			for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
				// Calculate the color of each pixel
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
				const __m128i imm_rgba_p0123_u8 = swrDrawTriangleSSSE3_packRGBA8(xmm_r_p0123, xmm_g_p0123, xmm_b_p0123, xmm_a_p0123);

				// Store
				_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

				// Move on to the next set of pixels
				xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
				xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
				frameBuffer += 4;
			}

			// Calculate the colors of the 4 next pixels and selectively store only the number
			// of remainder pixels for this row
			const uint32_t rem = numPixels & 3;
			if (rem != 0) {
				// Calculate the color of each pixel
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
				const __m128i imm_rgba_p0123_u8 = swrDrawTriangleSSSE3_packRGBA8(xmm_r_p0123, xmm_g_p0123, xmm_b_p0123, xmm_a_p0123);

				// Store: the first pixel is always written, then 0, 1 or 2 more.
				_mm_storeu_si32(frameBuffer, imm_rgba_p0123_u8);
				frameBuffer++;
				if (rem == 2) {
					// Pixel 1 moved into the lowest lane.
					_mm_storeu_si32(frameBuffer, _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(1, 1, 1, 1)));
				} else if (rem == 3) {
					// Pixels 1 and 2 moved into the two lowest lanes.
					_mm_storeu_si64(frameBuffer, _mm_shuffle_si128(imm_rgba_p0123_u8, imm_rgba_p0123_u8, _MM_SHUFFLE(2, 1, 2, 1)));
				}
			}
		}

		// Move on to the next row of pixels.
		imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
		framebufferRow += ctx->m_Width;
	}
}
// Rasterizes one triangle with per-vertex RGBA8 colors into ctx->m_FrameBuffer,
// shading up to 4 horizontally adjacent pixels per iteration (SSE4.1 variant).
//
// Parameters:
//   ctx              - render target; m_Width, m_Height and m_FrameBuffer are read.
//   (x0,y0)-(x2,y2)  - vertex positions in integer pixel coordinates.
//   color0..color2   - packed 8-bit-per-channel colors of vertices 0..2.
//
// Zero-area triangles are skipped. When the signed area is negative, vertices
// 1 and 2 (and their colors) are swapped, so both windings are drawn. The
// bounding box is clamped to the frame buffer and, for every row, only the
// pixels in the computed [ixmin, ixmax] span are written.
static void swrDrawTriangleSSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
	int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
	if (iarea == 0) {
		// Degenerate triangle with 0 area.
		return;
	}
	if (iarea < 0) {
		// Swap (x1, y1) <-> (x2, y2) so the area becomes positive.
		{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
		{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
		{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
		iarea = -iarea;
	}

	// Bounding box of the triangle, clamped to the frame buffer.
	const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
	const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
	const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
	const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
	const int32_t bboxWidth = bboxMaxX - bboxMinX;
	const int32_t bboxHeight = bboxMaxY - bboxMinY;
	if (bboxWidth < 0 || bboxHeight < 0) {
		// Triangle is completely outside the frame buffer: nothing would be
		// drawn, and bailing out here avoids forming an out-of-range row pointer.
		return;
	}

	// Vertex colors expanded to one float per channel (lane order follows the byte order of the packed color).
	const __m128i imm_zero = _mm_setzero_si128();
	const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128((int)color0), imm_zero), imm_zero));
	const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128((int)color1), imm_zero), imm_zero));
	const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128((int)color2), imm_zero), imm_zero));
	const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
	const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);

	// Per-channel broadcasts: color(u, v) = c0 + u * (c2 - c0) + v * (c1 - c0)
	const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
	const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
	const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

	const int32_t dy01 = y0 - y1;
	const int32_t dx01 = x0 - x1;
	const int32_t dx20 = x2 - x0;
	const int32_t dy20 = y2 - y0;
	const int32_t dy01_dy20 = dy01 + dy20;

	const __m128 xmm_zero = _mm_setzero_ps();
	const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

	// Barycentric coordinate deltas for the X direction
	const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
	const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
	const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
	const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
	const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

	// UV deltas for the 1st and 2nd pixel
	const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
	// UV deltas for the 3rd and 4th pixel
	const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
	// U and V steps when advancing by a whole group of 4 pixels
	const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

	// Barycentric coordinate deltas for the Y direction
	const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

	// Calculate unnormalized barycentric coordinates of the bounding box min.
	const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
	const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
	const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
	__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

	// Reciprocals of the per-pixel X deltas; row_uvw * scale gives the (fractional)
	// x offset at which each coordinate reaches zero on the current row.
	// A zero delta yields an infinite scale; the branches below that consume such
	// a lane are either skipped or clamp against INT32_MIN, leaving the span unchanged.
	const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

	// Byte shuffle turning planar r0123|g0123|b0123|a0123 into 4 interleaved RGBA pixels.
	const __m128i imm_rgba_shuffle = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

	uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
	for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
		int32_t ixmin = 0;
		int32_t ixmax = bboxWidth;

		// Calculate ixmin and ixmax: shrink the span [0, bboxWidth] by the x range
		// in which each of the three coordinates is non-negative on this row.
		{
			int32_t row_uvw_[4];
			_mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

			const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
			const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps(xmm_row_uvw_));
			const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps(xmm_row_uvw_));

			int32_t row_uvw_floor[4];
			_mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
			int32_t row_uvw_ceil[4];
			_mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

			if (dy01 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[0]);
			} else if (row_uvw_[0] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
			}

			if (dy20 > 0) {
				ixmax = swr_mini(ixmax, row_uvw_floor[1]);
			} else if (row_uvw_[1] != 0) {
				ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
			}

			// w grows by +dy01_dy20 per pixel (u and v shrink), hence the negated bounds.
			if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
				ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
			} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
				ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
			}
		}

		if (ixmin <= ixmax) {
			// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
			const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32(_mm_set1_epi32(ixmin), imm_x_duvw_));
			const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
			const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

			// Calculate barycentric coordinates for the 4 pixels.
			const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // 1st and 2nd pixels
			const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // 3rd and 4th pixels

			// Extract barycentric coordinates for each pixel
			__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
			__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

			uint32_t* frameBuffer = &framebufferRow[ixmin];
			uint32_t remaining = (uint32_t)((ixmax - ixmin) + 1);
			while (remaining != 0) {
				// Calculate the color of each pixel
				const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
				const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
				const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
				const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

				// Pack into uint8_t
				// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
				const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
					_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
				);

				// Shuffle into RGBA uint32_t
				__m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_rgba_shuffle);

				if (remaining >= 4) {
					// Full group: store all 4 pixels and move on to the next set.
					_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);
					xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
					xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
					frameBuffer += 4;
					remaining -= 4;
				} else {
					// Tail of 1..3 pixels. Partial stores are used so that no memory
					// beyond the span is read or written; a full 16-byte load/blend/store
					// here could run past the end of the frame buffer on its last row.
					if ((remaining & 2) != 0) {
						_mm_storel_epi64((__m128i*)frameBuffer, imm_rgba_p0123_u8);
						imm_rgba_p0123_u8 = _mm_srli_si128(imm_rgba_p0123_u8, 8);
						frameBuffer += 2;
					}
					if ((remaining & 1) != 0) {
						*frameBuffer = (uint32_t)_mm_cvtsi128_si32(imm_rgba_p0123_u8);
					}
					remaining = 0;
				}
			}
		}

		// Move on to the next row of pixels.
		imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
		framebufferRow += ctx->m_Width;
	}
}
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef SWR_SWR_MATH_H | |
#define SWR_SWR_MATH_H | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <immintrin.h> | |
// Thin wrapper around a single __m128 value (four packed floats).
struct vec4f
{
	__m128 m_XMM;
};
typedef struct vec4f vec4f;
// Thin wrapper around a single __m128i value (used here as four packed 32-bit integers).
struct vec4i
{
	__m128i m_IMM;
};
typedef struct vec4i vec4i;
static inline vec4f vec4f_zero(void) | |
{ | |
return (vec4f){ .m_XMM = _mm_setzero_ps() }; | |
} | |
static inline vec4f vec4f_fromFloat(float x) | |
{ | |
return (vec4f){ .m_XMM = _mm_set_ps1(x) }; | |
} | |
// Converts each signed 32-bit integer lane of x to a float lane.
static inline vec4f vec4f_fromVec4i(vec4i x)
{
	vec4f result;
	result.m_XMM = _mm_cvtepi32_ps(x.m_IMM);
	return result;
}
static inline vec4f vec4f_fromFloat4(float x0, float x1, float x2, float x3) | |
{ | |
return (vec4f){ .m_XMM = _mm_set_ps(x3, x2, x1, x0) }; | |
} | |
static inline vec4f vec4f_fromRGBA8(uint32_t rgba8) | |
{ | |
const __m128i imm_zero = _mm_setzero_si128(); | |
const __m128i imm_rgba8 = _mm_cvtsi32_si128(rgba8); | |
const __m128i imm_rgba16 = _mm_unpacklo_epi8(imm_rgba8, imm_zero); | |
const __m128i imm_rgba32 = _mm_unpacklo_epi16(imm_rgba16, imm_zero); | |
return (vec4f){ | |
.m_XMM = _mm_cvtepi32_ps(imm_rgba32) | |
}; | |
} | |
// Converts four float channels to a packed 4x8-bit color.
// Each lane is converted to int32, then narrowed with saturation
// (signed 16-bit, then unsigned 8-bit); lane 0 ends up in the lowest byte.
static inline uint32_t vec4f_toRGBA8(vec4f x)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i asInt32 = _mm_cvtps_epi32(x.m_XMM);
	const __m128i asInt16 = _mm_packs_epi32(asInt32, zero);
	const __m128i asUint8 = _mm_packus_epi16(asInt16, zero);
	const int packed = _mm_cvtsi128_si32(asUint8);
	return (uint32_t)packed;
}
// Lane-wise a + b.
static inline vec4f vec4f_add(vec4f a, vec4f b)
{
	vec4f sum;
	sum.m_XMM = _mm_add_ps(a.m_XMM, b.m_XMM);
	return sum;
}
// Lane-wise a - b.
static inline vec4f vec4f_sub(vec4f a, vec4f b)
{
	vec4f difference;
	difference.m_XMM = _mm_sub_ps(a.m_XMM, b.m_XMM);
	return difference;
}
// Lane-wise a * b.
static inline vec4f vec4f_mul(vec4f a, vec4f b)
{
	vec4f product;
	product.m_XMM = _mm_mul_ps(a.m_XMM, b.m_XMM);
	return product;
}
// Builds an 8-bit shuffle immediate from four 2-bit lane selectors.
// d0_a/d1_a pick the source lanes for result lanes 0 and 1 (taken from the first
// operand of a two-operand shuffle), d2_b/d3_b for result lanes 2 and 3 (second operand).
#define VEC4_SHUFFLE_MASK(d0_a, d1_a, d2_b, d3_b) \
	(((d0_a)) | ((d1_a) << 2) | ((d2_b) << 4) | ((d3_b) << 6))

// Named shuffle immediates; X, Y, Z, W denote lanes 0, 1, 2, 3.
typedef enum vec4_shuffle_mask
{
	// Broadcast a single lane.
	VEC4_SHUFFLE_XXXX = VEC4_SHUFFLE_MASK(0, 0, 0, 0),
	VEC4_SHUFFLE_YYYY = VEC4_SHUFFLE_MASK(1, 1, 1, 1),
	VEC4_SHUFFLE_ZZZZ = VEC4_SHUFFLE_MASK(2, 2, 2, 2),
	VEC4_SHUFFLE_WWWW = VEC4_SHUFFLE_MASK(3, 3, 3, 3),
	// Repeat a pair of lanes.
	VEC4_SHUFFLE_XYXY = VEC4_SHUFFLE_MASK(0, 1, 0, 1),
	VEC4_SHUFFLE_XZXZ = VEC4_SHUFFLE_MASK(0, 2, 0, 2),
	VEC4_SHUFFLE_YWYW = VEC4_SHUFFLE_MASK(1, 3, 1, 3),
	VEC4_SHUFFLE_ZWZW = VEC4_SHUFFLE_MASK(2, 3, 2, 3),
} vec4_shuffle_mask;
// Generates vec4f_get<swizzle>(x), which returns x with its lanes rearranged
// according to VEC4_SHUFFLE_<swizzle>.
#define VEC4F_GET_FUNC(swizzle) \
	static inline vec4f vec4f_get##swizzle(vec4f x) \
	{ \
		vec4f swizzled; \
		swizzled.m_XMM = _mm_shuffle_ps(x.m_XMM, x.m_XMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)); \
		return swizzled; \
	}

VEC4F_GET_FUNC(XXXX)
VEC4F_GET_FUNC(YYYY)
VEC4F_GET_FUNC(ZZZZ)
VEC4F_GET_FUNC(WWWW)
VEC4F_GET_FUNC(XYXY)
VEC4F_GET_FUNC(ZWZW)
// Two-operand shuffle: result lanes 0-1 are selected from a, lanes 2-3 from b,
// as described by mask (see VEC4_SHUFFLE_MASK).
// Function-like macro because mask must be an immediate (constant).
// Arguments and the whole expansion are parenthesized so that arbitrary
// expressions can be passed and the result can be used inside larger expressions.
#define vec4f_shuffle(a, b, mask) ((vec4f){ .m_XMM = _mm_shuffle_ps((a).m_XMM, (b).m_XMM, (mask)) })
// http://dss.stephanierct.com/DevBlog/?p=8
// Rounds each lane toward negative infinity using only SSE2: truncate through an
// int32 round trip, then subtract 1 from the lanes where truncation rounded up.
// NOTE: lanes whose value does not fit in an int32 are not handled by the round trip.
static inline vec4f vec4f_floor_SSE2(vec4f x)
{
	// Built in a register: a plain static float[4] is not guaranteed to be
	// 16-byte aligned, which an aligned _mm_load_ps would require.
	const __m128 ones = _mm_set1_ps(1.0f);
	const __m128i i = _mm_cvttps_epi32(x.m_XMM);
	const __m128 fi = _mm_cvtepi32_ps(i);
	// All-ones mask in the lanes where the truncated value is greater than x.
	const __m128 igx = _mm_cmpgt_ps(fi, x.m_XMM);
	const __m128 j = _mm_and_ps(igx, ones);
	return (vec4f){ .m_XMM = _mm_sub_ps(fi, j) };
}
// http://dss.stephanierct.com/DevBlog/?p=8
// Rounds each lane toward positive infinity using only SSE2: truncate through an
// int32 round trip, then add 1 to the lanes where truncation rounded down.
// NOTE: lanes whose value does not fit in an int32 are not handled by the round trip.
static inline vec4f vec4f_ceil_SSE2(vec4f x)
{
	// Built in a register: a plain static float[4] is not guaranteed to be
	// 16-byte aligned, which an aligned _mm_load_ps would require.
	const __m128 ones = _mm_set1_ps(1.0f);
	const __m128i i = _mm_cvttps_epi32(x.m_XMM);
	const __m128 fi = _mm_cvtepi32_ps(i);
	// All-ones mask in the lanes where the truncated value is less than x.
	const __m128 igx = _mm_cmplt_ps(fi, x.m_XMM);
	const __m128 j = _mm_and_ps(igx, ones);
	return (vec4f){ .m_XMM = _mm_add_ps(fi, j) };
}
// Rounds each lane toward negative infinity (SSE4.1 rounding instruction).
static inline vec4f vec4f_floor_SSE41(vec4f x)
{
	vec4f rounded;
	rounded.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_FLOOR);
	return rounded;
}
// Rounds each lane toward positive infinity (SSE4.1 rounding instruction).
static inline vec4f vec4f_ceil_SSE41(vec4f x)
{
	vec4f rounded;
	rounded.m_XMM = _mm_round_ps(x.m_XMM, _MM_FROUND_CEIL);
	return rounded;
}
// Lane-wise a * b + c, computed as a separate multiply followed by an add.
static inline vec4f vec4f_madd_SSE2(vec4f a, vec4f b, vec4f c)
{
	const __m128 product = _mm_mul_ps(a.m_XMM, b.m_XMM);
	vec4f result;
	result.m_XMM = _mm_add_ps(c.m_XMM, product);
	return result;
}
static inline vec4i vec4i_zero(void) | |
{ | |
return (vec4i){ .m_IMM = _mm_setzero_si128()}; | |
} | |
static inline vec4i vec4i_one(void) | |
{ | |
return (vec4i){ .m_IMM = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()) }; | |
} | |
static inline vec4i vec4i_fromInt(int32_t x) | |
{ | |
return (vec4i){ .m_IMM = _mm_set1_epi32(x) }; | |
} | |
// Converts each float lane of x to a signed 32-bit integer using
// _mm_cvtps_epi32 (rounding conversion, as opposed to the truncating _mm_cvttps_epi32).
static inline vec4i vec4i_fromVec4f(vec4f x)
{
	vec4i result;
	result.m_IMM = _mm_cvtps_epi32(x.m_XMM);
	return result;
}
static inline vec4i vec4i_fromInt4(int32_t x0, int32_t x1, int32_t x2, int32_t x3) | |
{ | |
return (vec4i){ .m_IMM = _mm_set_epi32(x3, x2, x1, x0) }; | |
} | |
static inline vec4i vec4i_fromInt4va(const int32_t* arr) | |
{ | |
return (vec4i){ .m_IMM = _mm_load_si128((const __m128i*)arr) }; | |
} | |
// Stores the four lanes of x to arr[0..3]; arr may have any alignment.
static inline void vec4i_toInt4vu(vec4i x, int32_t* arr)
{
	__m128i* destination = (__m128i*)arr;
	_mm_storeu_si128(destination, x.m_IMM);
}
// Stores the four lanes of x to arr[0..3].
// Uses an aligned store: arr must be 16-byte aligned.
static inline void vec4i_toInt4va(vec4i x, int32_t* arr)
{
	__m128i* destination = (__m128i*)arr;
	_mm_store_si128(destination, x.m_IMM);
}
// Read-modify-write of 16 bytes at buffer: bits of x are written where the
// corresponding bit of mask is set, the existing memory bits are kept elsewhere.
// Uses aligned load/store: buffer must be 16-byte aligned.
// (A _mm_maskmoveu_si128 based variant was previously kept here but disabled.)
static inline void vec4i_toInt4va_masked(vec4i x, vec4i mask, int32_t* buffer)
{
	__m128i* target = (__m128i*)buffer;
	const __m128i current = _mm_load_si128((const __m128i*)target);
	// current ^ ((current ^ x) & mask) == (current & ~mask) | (x & mask)
	const __m128i changedBits = _mm_and_si128(_mm_xor_si128(current, x.m_IMM), mask.m_IMM);
	_mm_store_si128(target, _mm_xor_si128(current, changedBits));
}
// Read-modify-write of 16 bytes at buffer with an inverted mask: existing memory
// bits are kept where the corresponding bit of maskInv is set, bits of x are
// written elsewhere.
// Uses aligned load/store: buffer must be 16-byte aligned.
// (A _mm_maskmoveu_si128 based variant was previously kept here but disabled.)
static inline void vec4i_toInt4va_maskedInv_SSE2(vec4i x, vec4i maskInv, int32_t* buffer)
{
	__m128i* target = (__m128i*)buffer;
	const __m128i current = _mm_load_si128((const __m128i*)target);
	// x ^ ((x ^ current) & maskInv) == (current & maskInv) | (x & ~maskInv)
	const __m128i keptBits = _mm_and_si128(_mm_xor_si128(x.m_IMM, current), maskInv.m_IMM);
	_mm_store_si128(target, _mm_xor_si128(x.m_IMM, keptBits));
}
// Read-modify-write of 16 bytes at buffer with an inverted mask (SSE4.1 blend):
// for every byte whose top bit is set in maskInv the existing memory byte is
// kept, otherwise the byte of x is written.
// Uses aligned load/store: buffer must be 16-byte aligned.
static inline void vec4i_toInt4va_maskedInv_SSE41(vec4i x, vec4i maskInv, int32_t* buffer)
{
	__m128i* target = (__m128i*)buffer;
	const __m128i current = _mm_load_si128((const __m128i*)target);
	const __m128i blended = _mm_blendv_epi8(x.m_IMM, current, maskInv.m_IMM);
	_mm_store_si128(target, blended);
}
// Returns lane 0 (the lowest 32 bits) of x.
static inline int32_t vec4i_toInt(vec4i x)
{
	const int32_t lane0 = _mm_cvtsi128_si32(x.m_IMM);
	return lane0;
}
// Lane-wise 32-bit a + b.
static inline vec4i vec4i_add(vec4i a, vec4i b)
{
	vec4i sum;
	sum.m_IMM = _mm_add_epi32(a.m_IMM, b.m_IMM);
	return sum;
}
// Lane-wise 32-bit a - b.
static inline vec4i vec4i_sub(vec4i a, vec4i b)
{
	vec4i difference;
	difference.m_IMM = _mm_sub_epi32(a.m_IMM, b.m_IMM);
	return difference;
}
// Lane-wise 32-bit multiply keeping the low 32 bits of each product,
// emulated with SSE2 (which only offers a 32x32 -> 64 multiply on lanes 0 and 2).
static inline vec4i vec4i_mullo_SSE2(vec4i a, vec4i b)
{
	// 64-bit products of lanes 0 and 2.
	const __m128i evenProducts = _mm_mul_epu32(a.m_IMM, b.m_IMM);
	// Shift lanes 1 and 3 down into positions 0 and 2, then multiply those.
	const __m128i aOdd = _mm_srli_si128(a.m_IMM, 4);
	const __m128i bOdd = _mm_srli_si128(b.m_IMM, 4);
	const __m128i oddProducts = _mm_mul_epu32(aOdd, bOdd);
	// Gather the low halves of both products into lanes 0 and 1 ...
	const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i oddLow = _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(0, 0, 2, 0));
	// ... and interleave them back into lane order 0, 1, 2, 3.
	vec4i result;
	result.m_IMM = _mm_unpacklo_epi32(evenLow, oddLow);
	return result;
}
// Lane-wise 32-bit multiply keeping the low 32 bits of each product (SSE4.1).
static inline vec4i vec4i_mullo_SSE41(vec4i a, vec4i b)
{
	vec4i product;
	product.m_IMM = _mm_mullo_epi32(a.m_IMM, b.m_IMM);
	return product;
}
// Bitwise a & b.
static inline vec4i vec4i_and(vec4i a, vec4i b)
{
	vec4i result;
	result.m_IMM = _mm_and_si128(a.m_IMM, b.m_IMM);
	return result;
}
// Bitwise a | b.
static inline vec4i vec4i_or(vec4i a, vec4i b)
{
	vec4i result;
	result.m_IMM = _mm_or_si128(a.m_IMM, b.m_IMM);
	return result;
}
// Bitwise a | b | c.
static inline vec4i vec4i_or3(vec4i a, vec4i b, vec4i c)
{
	const __m128i bOrC = _mm_or_si128(b.m_IMM, c.m_IMM);
	vec4i result;
	result.m_IMM = _mm_or_si128(a.m_IMM, bOrC);
	return result;
}
// Bitwise (~a) & b. Note that the FIRST operand is the one that is inverted.
static inline vec4i vec4i_andnot(vec4i a, vec4i b)
{
	vec4i result;
	result.m_IMM = _mm_andnot_si128(a.m_IMM, b.m_IMM);
	return result;
}
// Bitwise a ^ b.
static inline vec4i vec4i_xor(vec4i a, vec4i b)
{
	vec4i result;
	result.m_IMM = _mm_xor_si128(a.m_IMM, b.m_IMM);
	return result;
}
// Arithmetic (sign-preserving) right shift of every 32-bit lane by `shift` bits.
static inline vec4i vec4i_sar(vec4i x, uint32_t shift)
{
	vec4i shifted;
	shifted.m_IMM = _mm_srai_epi32(x.m_IMM, shift);
	return shifted;
}
// Left shift of every 32-bit lane by `shift` bits, shifting in zeros.
static inline vec4i vec4i_sal(vec4i x, uint32_t shift)
{
	vec4i shifted;
	shifted.m_IMM = _mm_slli_epi32(x.m_IMM, shift);
	return shifted;
}
// Signed lane-wise a < b; each result lane is all ones when true, zero otherwise.
static inline vec4i vec4i_cmplt(vec4i a, vec4i b)
{
	vec4i lessThan;
	lessThan.m_IMM = _mm_cmplt_epi32(a.m_IMM, b.m_IMM);
	return lessThan;
}
// Converts four planar channel vectors (one 32-bit value per pixel and channel)
// into four interleaved RGBA8 pixels, using SSE2 only.
// The 32-bit values are narrowed with saturation (signed 16-bit, then unsigned 8-bit).
static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSE2(vec4i r, vec4i g, vec4i b, vec4i a)
{
	// Pack into uint8_t
	// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
	const __m128i rg16 = _mm_packs_epi32(r.m_IMM, g.m_IMM);
	const __m128i ba16 = _mm_packs_epi32(b.m_IMM, a.m_IMM);
	const __m128i planar = _mm_packus_epi16(rg16, ba16);

	// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
	// _mm_shuffle_epi8() with SSE2: two rounds of "split even/odd bytes and re-pack"
	// transpose the 4x4 byte matrix. The mask keeps the low byte of every 16-bit word.
	const __m128i lowByteMask = _mm_set1_epi16(0x00FF);

	// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
	const __m128i evenBytes1 = _mm_and_si128(planar, lowByteMask);
	const __m128i oddBytes1 = _mm_srli_epi16(planar, 8);
	const __m128i halfway = _mm_packus_epi16(evenBytes1, oddBytes1);

	// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
	const __m128i evenBytes2 = _mm_and_si128(halfway, lowByteMask);
	const __m128i oddBytes2 = _mm_srli_epi16(halfway, 8);

	vec4i pixels;
	pixels.m_IMM = _mm_packus_epi16(evenBytes2, oddBytes2);
	return pixels;
}
// Converts four planar channel vectors (one 32-bit value per pixel and channel)
// into four interleaved RGBA8 pixels, using the SSSE3 byte shuffle.
// The 32-bit values are narrowed with saturation (signed 16-bit, then unsigned 8-bit).
static inline vec4i vec4i_packR32G32B32A32_to_RGBA8_SSSE3(vec4i r, vec4i g, vec4i b, vec4i a)
{
	// Pack into uint8_t
	// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
	const __m128i rg16 = _mm_packs_epi32(r.m_IMM, g.m_IMM);
	const __m128i ba16 = _mm_packs_epi32(b.m_IMM, a.m_IMM);
	const __m128i planar = _mm_packus_epi16(rg16, ba16);

	// Destination byte i takes source byte interleave[i]: a 4x4 byte transpose.
	const __m128i interleave = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

	vec4i pixels;
	pixels.m_IMM = _mm_shuffle_epi8(planar, interleave);
	return pixels;
}
// Returns true when at least one 32-bit lane of x has its sign bit set.
// The sign bits are gathered directly with a float movemask; no explicit
// comparison against zero is needed.
static inline bool vec4i_any_neg_SSE2(vec4i x)
{
	const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
	return signBits != 0;
}
// Returns true when all four 32-bit lanes of x have their sign bit set.
static inline bool vec4i_all_neg_SSE2(vec4i x)
{
	const int signBits = _mm_movemask_ps(_mm_castsi128_ps(x.m_IMM));
	return signBits == 0x0F;
}
// Returns a 4-bit mask in which bit i is the sign bit of 32-bit lane i of x.
static inline uint32_t vec4i_getSignMask(vec4i x)
{
	const __m128 asFloat = _mm_castsi128_ps(x.m_IMM);
	const int signBits = _mm_movemask_ps(asFloat);
	return signBits;
}
// Generates vec4i_get<swizzle>(x), which returns x with its 32-bit lanes
// rearranged according to VEC4_SHUFFLE_<swizzle>.
#define VEC4I_GET_FUNC(swizzle) \
	static inline vec4i vec4i_get##swizzle(vec4i x) \
	{ \
		vec4i swizzled; \
		swizzled.m_IMM = _mm_shuffle_epi32(x.m_IMM, (uint32_t)(VEC4_SHUFFLE_##swizzle)); \
		return swizzled; \
	}

VEC4I_GET_FUNC(XXXX)
VEC4I_GET_FUNC(YYYY)
VEC4I_GET_FUNC(ZZZZ)
VEC4I_GET_FUNC(WWWW)
VEC4I_GET_FUNC(XYXY)
VEC4I_GET_FUNC(ZWZW)
// Scalar integer helpers. They are defined as `static inline` in
// inline/swr_math.inl, so the prototypes must carry the same (internal) linkage;
// a plain external prototype followed by a static definition is a linkage conflict.
static inline int32_t swr_absi(int32_t x);
static inline int32_t swr_mini(int32_t a, int32_t b);
static inline int32_t swr_maxi(int32_t a, int32_t b);
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c);
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c);
static inline int32_t swr_alignDown(int32_t x, uint32_t align);
static inline int32_t swr_alignUp(int32_t x, uint32_t align);
static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom);
static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom);

// Kept inside the include guard so that including this header more than once
// does not pull in the definitions a second time.
#include "inline/swr_math.inl"

#endif // SWR_SWR_MATH_H
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef SWR_SWR_MATH_H | |
#error "Must be included from swr_math.h" | |
#endif | |
// Returns the absolute value of x.
// The result is not representable for x == INT32_MIN.
static inline int32_t swr_absi(int32_t x)
{
	if (x >= 0) {
		return x;
	}
	return -x;
}
// Returns the smaller of a and b.
static inline int32_t swr_mini(int32_t a, int32_t b)
{
	if (a < b) {
		return a;
	}
	return b;
}
// Returns the larger of a and b.
static inline int32_t swr_maxi(int32_t a, int32_t b)
{
	if (a > b) {
		return a;
	}
	return b;
}
// Returns the smallest of a, b and c.
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c)
{
	int32_t smallest = a;
	if (b < smallest) {
		smallest = b;
	}
	if (c < smallest) {
		smallest = c;
	}
	return smallest;
}
// Returns the largest of a, b and c.
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c)
{
	int32_t largest = a;
	if (b > largest) {
		largest = b;
	}
	if (c > largest) {
		largest = c;
	}
	return largest;
}
// Rounds x down (toward negative infinity) to the nearest multiple of align.
// Works for negative x and for any non-zero align, not only powers of two.
// align == 0 returns x unchanged instead of dividing by zero.
static inline int32_t swr_alignDown(int32_t x, uint32_t align)
{
	if (align == 0) {
		return x;
	}

	// Signed 64-bit math: mixing int32_t with uint32_t would convert a negative x
	// to a huge unsigned value before dividing.
	const int64_t step = (int64_t)align;
	const int64_t value = (int64_t)x;
	int64_t remainder = value % step;
	if (remainder < 0) {
		// C's % takes the sign of the dividend; normalize to [0, step).
		remainder += step;
	}
	return (int32_t)(value - remainder);
}
// Rounds x up (toward positive infinity) to the nearest multiple of align.
// Works for negative x and for any non-zero align, not only powers of two.
// align == 0 returns x unchanged instead of dividing by zero.
static inline int32_t swr_alignUp(int32_t x, uint32_t align)
{
	if (align == 0) {
		return x;
	}

	// Signed 64-bit math: mixing int32_t with uint32_t would convert a negative x
	// to a huge unsigned value before dividing.
	const int64_t step = (int64_t)align;
	const int64_t value = (int64_t)x;
	int64_t remainder = value % step;
	if (remainder < 0) {
		// C's % takes the sign of the dividend; normalize to [0, step).
		remainder += step;
	}
	if (remainder == 0) {
		return x;
	}
	return (int32_t)(value + (step - remainder));
}
// Integer division rounding toward negative infinity: floor(numer / denom).
// denom must be non-zero (and the pair INT32_MIN / -1 overflows).
static inline int32_t swr_idiv_floor(int32_t numer, int32_t denom)
{
	// C division truncates toward zero, which is one too high whenever the exact
	// quotient is negative and not an integer (operands of opposite sign, non-zero remainder).
	int32_t quotient = numer / denom;
	const int32_t remainder = numer % denom;
	if (remainder != 0 && ((remainder < 0) != (denom < 0))) {
		--quotient;
	}
	return quotient;
}
// Integer division rounding toward positive infinity: ceil(numer / denom).
// denom must be non-zero (and the pair INT32_MIN / -1 overflows).
static inline int32_t swr_idiv_ceil(int32_t numer, int32_t denom)
{
	// C division truncates toward zero, which already equals the ceiling for
	// negative quotients; only a positive, non-integer quotient needs the +1
	// (non-zero remainder with the same sign as the divisor).
	int32_t quotient = numer / denom;
	const int32_t remainder = numer % denom;
	if (remainder != 0 && ((remainder < 0) == (denom < 0))) {
		++quotient;
	}
	return quotient;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment