Created
December 16, 2022 08:17
-
-
Save jdryg/725e408b948ed1d0c02f675e825c7e97 to your computer and use it in GitHub Desktop.
Triangle rasterization experiments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* 8x8 monochrome bitmap fonts for rendering | |
* Author: Daniel Hepper <daniel@hepper.net> | |
* | |
* License: Public Domain | |
* | |
* Based on: | |
* // Summary: font8x8.h | |
* // 8x8 monochrome bitmap fonts for rendering | |
* // | |
* // Author: | |
* // Marcel Sondaar | |
* // International Business Machines (public domain VGA fonts) | |
* // | |
* // License: | |
* // Public Domain | |
* | |
* Fetched from: http://dimensionalrift.homelinux.net/combuster/mos3/?p=viewsource&file=/modules/gfx/font8_8.asm | |
**/ | |
#include <stdint.h> | |
// Constant: font8x8_basic | |
// Contains an 8x8 font map for unicode points U+0000 - U+007F (basic latin) | |
uint8_t font8x8_basic[] = { | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0000 (nul) | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0001 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0002 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0003 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0004 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0005 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0006 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0007 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0008 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0009 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000A | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000B | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000C | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000D | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000E | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000F | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0010 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0011 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0012 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0013 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0014 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0015 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0016 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0017 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0018 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0019 | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001A | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001B | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001C | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001D | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001E | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001F | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0020 (space) | |
0x18, 0x3C, 0x3C, 0x18, 0x18, 0x00, 0x18, 0x00, // U+0021 (!) | |
0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0022 (") | |
0x36, 0x36, 0x7F, 0x36, 0x7F, 0x36, 0x36, 0x00, // U+0023 (#) | |
0x0C, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x0C, 0x00, // U+0024 ($) | |
0x00, 0x63, 0x33, 0x18, 0x0C, 0x66, 0x63, 0x00, // U+0025 (%) | |
0x1C, 0x36, 0x1C, 0x6E, 0x3B, 0x33, 0x6E, 0x00, // U+0026 (&) | |
0x06, 0x06, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0027 (') | |
0x18, 0x0C, 0x06, 0x06, 0x06, 0x0C, 0x18, 0x00, // U+0028 (() | |
0x06, 0x0C, 0x18, 0x18, 0x18, 0x0C, 0x06, 0x00, // U+0029 ()) | |
0x00, 0x66, 0x3C, 0xFF, 0x3C, 0x66, 0x00, 0x00, // U+002A (*) | |
0x00, 0x0C, 0x0C, 0x3F, 0x0C, 0x0C, 0x00, 0x00, // U+002B (+) | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x06, // U+002C (,) | |
0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, // U+002D (-) | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x00, // U+002E (.) | |
0x60, 0x30, 0x18, 0x0C, 0x06, 0x03, 0x01, 0x00, // U+002F (/) | |
0x3E, 0x63, 0x73, 0x7B, 0x6F, 0x67, 0x3E, 0x00, // U+0030 (0) | |
0x0C, 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x3F, 0x00, // U+0031 (1) | |
0x1E, 0x33, 0x30, 0x1C, 0x06, 0x33, 0x3F, 0x00, // U+0032 (2) | |
0x1E, 0x33, 0x30, 0x1C, 0x30, 0x33, 0x1E, 0x00, // U+0033 (3) | |
0x38, 0x3C, 0x36, 0x33, 0x7F, 0x30, 0x78, 0x00, // U+0034 (4) | |
0x3F, 0x03, 0x1F, 0x30, 0x30, 0x33, 0x1E, 0x00, // U+0035 (5) | |
0x1C, 0x06, 0x03, 0x1F, 0x33, 0x33, 0x1E, 0x00, // U+0036 (6) | |
0x3F, 0x33, 0x30, 0x18, 0x0C, 0x0C, 0x0C, 0x00, // U+0037 (7) | |
0x1E, 0x33, 0x33, 0x1E, 0x33, 0x33, 0x1E, 0x00, // U+0038 (8) | |
0x1E, 0x33, 0x33, 0x3E, 0x30, 0x18, 0x0E, 0x00, // U+0039 (9) | |
0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x00, // U+003A (:) | |
0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x06, // U+003B (;) | |
0x18, 0x0C, 0x06, 0x03, 0x06, 0x0C, 0x18, 0x00, // U+003C (<) | |
0x00, 0x00, 0x3F, 0x00, 0x00, 0x3F, 0x00, 0x00, // U+003D (=) | |
0x06, 0x0C, 0x18, 0x30, 0x18, 0x0C, 0x06, 0x00, // U+003E (>) | |
0x1E, 0x33, 0x30, 0x18, 0x0C, 0x00, 0x0C, 0x00, // U+003F (?) | |
0x3E, 0x63, 0x7B, 0x7B, 0x7B, 0x03, 0x1E, 0x00, // U+0040 (@) | |
0x0C, 0x1E, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x00, // U+0041 (A) | |
0x3F, 0x66, 0x66, 0x3E, 0x66, 0x66, 0x3F, 0x00, // U+0042 (B) | |
0x3C, 0x66, 0x03, 0x03, 0x03, 0x66, 0x3C, 0x00, // U+0043 (C) | |
0x1F, 0x36, 0x66, 0x66, 0x66, 0x36, 0x1F, 0x00, // U+0044 (D) | |
0x7F, 0x46, 0x16, 0x1E, 0x16, 0x46, 0x7F, 0x00, // U+0045 (E) | |
0x7F, 0x46, 0x16, 0x1E, 0x16, 0x06, 0x0F, 0x00, // U+0046 (F) | |
0x3C, 0x66, 0x03, 0x03, 0x73, 0x66, 0x7C, 0x00, // U+0047 (G) | |
0x33, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x33, 0x00, // U+0048 (H) | |
0x1E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+0049 (I) | |
0x78, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E, 0x00, // U+004A (J) | |
0x67, 0x66, 0x36, 0x1E, 0x36, 0x66, 0x67, 0x00, // U+004B (K) | |
0x0F, 0x06, 0x06, 0x06, 0x46, 0x66, 0x7F, 0x00, // U+004C (L) | |
0x63, 0x77, 0x7F, 0x7F, 0x6B, 0x63, 0x63, 0x00, // U+004D (M) | |
0x63, 0x67, 0x6F, 0x7B, 0x73, 0x63, 0x63, 0x00, // U+004E (N) | |
0x1C, 0x36, 0x63, 0x63, 0x63, 0x36, 0x1C, 0x00, // U+004F (O) | |
0x3F, 0x66, 0x66, 0x3E, 0x06, 0x06, 0x0F, 0x00, // U+0050 (P) | |
0x1E, 0x33, 0x33, 0x33, 0x3B, 0x1E, 0x38, 0x00, // U+0051 (Q) | |
0x3F, 0x66, 0x66, 0x3E, 0x36, 0x66, 0x67, 0x00, // U+0052 (R) | |
0x1E, 0x33, 0x07, 0x0E, 0x38, 0x33, 0x1E, 0x00, // U+0053 (S) | |
0x3F, 0x2D, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+0054 (T) | |
0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x3F, 0x00, // U+0055 (U) | |
0x33, 0x33, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00, // U+0056 (V) | |
0x63, 0x63, 0x63, 0x6B, 0x7F, 0x77, 0x63, 0x00, // U+0057 (W) | |
0x63, 0x63, 0x36, 0x1C, 0x1C, 0x36, 0x63, 0x00, // U+0058 (X) | |
0x33, 0x33, 0x33, 0x1E, 0x0C, 0x0C, 0x1E, 0x00, // U+0059 (Y) | |
0x7F, 0x63, 0x31, 0x18, 0x4C, 0x66, 0x7F, 0x00, // U+005A (Z) | |
0x1E, 0x06, 0x06, 0x06, 0x06, 0x06, 0x1E, 0x00, // U+005B ([) | |
0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0x40, 0x00, // U+005C (\) | |
0x1E, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1E, 0x00, // U+005D (]) | |
0x08, 0x1C, 0x36, 0x63, 0x00, 0x00, 0x00, 0x00, // U+005E (^) | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, // U+005F (_) | |
0x0C, 0x0C, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0060 (`) | |
0x00, 0x00, 0x1E, 0x30, 0x3E, 0x33, 0x6E, 0x00, // U+0061 (a) | |
0x07, 0x06, 0x06, 0x3E, 0x66, 0x66, 0x3B, 0x00, // U+0062 (b) | |
0x00, 0x00, 0x1E, 0x33, 0x03, 0x33, 0x1E, 0x00, // U+0063 (c) | |
0x38, 0x30, 0x30, 0x3e, 0x33, 0x33, 0x6E, 0x00, // U+0064 (d) | |
0x00, 0x00, 0x1E, 0x33, 0x3f, 0x03, 0x1E, 0x00, // U+0065 (e) | |
0x1C, 0x36, 0x06, 0x0f, 0x06, 0x06, 0x0F, 0x00, // U+0066 (f) | |
0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x1F, // U+0067 (g) | |
0x07, 0x06, 0x36, 0x6E, 0x66, 0x66, 0x67, 0x00, // U+0068 (h) | |
0x0C, 0x00, 0x0E, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+0069 (i) | |
0x30, 0x00, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E, // U+006A (j) | |
0x07, 0x06, 0x66, 0x36, 0x1E, 0x36, 0x67, 0x00, // U+006B (k) | |
0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+006C (l) | |
0x00, 0x00, 0x33, 0x7F, 0x7F, 0x6B, 0x63, 0x00, // U+006D (m) | |
0x00, 0x00, 0x1F, 0x33, 0x33, 0x33, 0x33, 0x00, // U+006E (n) | |
0x00, 0x00, 0x1E, 0x33, 0x33, 0x33, 0x1E, 0x00, // U+006F (o) | |
0x00, 0x00, 0x3B, 0x66, 0x66, 0x3E, 0x06, 0x0F, // U+0070 (p) | |
0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x78, // U+0071 (q) | |
0x00, 0x00, 0x3B, 0x6E, 0x66, 0x06, 0x0F, 0x00, // U+0072 (r) | |
0x00, 0x00, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x00, // U+0073 (s) | |
0x08, 0x0C, 0x3E, 0x0C, 0x0C, 0x2C, 0x18, 0x00, // U+0074 (t) | |
0x00, 0x00, 0x33, 0x33, 0x33, 0x33, 0x6E, 0x00, // U+0075 (u) | |
0x00, 0x00, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00, // U+0076 (v) | |
0x00, 0x00, 0x63, 0x6B, 0x7F, 0x7F, 0x36, 0x00, // U+0077 (w) | |
0x00, 0x00, 0x63, 0x36, 0x1C, 0x36, 0x63, 0x00, // U+0078 (x) | |
0x00, 0x00, 0x33, 0x33, 0x33, 0x3E, 0x30, 0x1F, // U+0079 (y) | |
0x00, 0x00, 0x3F, 0x19, 0x0C, 0x26, 0x3F, 0x00, // U+007A (z) | |
0x38, 0x0C, 0x0C, 0x07, 0x0C, 0x0C, 0x38, 0x00, // U+007B ({) | |
0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00, // U+007C (|) | |
0x07, 0x0C, 0x0C, 0x38, 0x0C, 0x0C, 0x07, 0x00, // U+007D (}) | |
0x6E, 0x3B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+007E (~) | |
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // U+007F | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _CRT_SECURE_NO_WARNINGS | |
#include "minifb/MiniFB.h"
#include "font8x8_basic.h"
#include <assert.h>
#include <malloc.h>
#include <math.h>
#include <memory.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#define FRAMEBUFFER_CONFIG_WIDTH 1024 | |
#define FRAMEBUFFER_CONFIG_HEIGHT 1024 | |
// Pixel layouts the rasterizer can pack colors for. Define
// SWR_FRAMEBUFFER_FORMAT to one of these before this point to override the
// default (BGRA).
#define SWR_COLOR_FORMAT_RGBA 0
#define SWR_COLOR_FORMAT_BGRA 1
#ifndef SWR_FRAMEBUFFER_FORMAT
#define SWR_FRAMEBUFFER_FORMAT SWR_COLOR_FORMAT_BGRA
#endif

// Bit offset of each 8-bit channel inside a packed 32-bit pixel.
#if SWR_FRAMEBUFFER_FORMAT == SWR_COLOR_FORMAT_BGRA
#define SWR_COLOR_BLUE_Pos 0
#define SWR_COLOR_GREEN_Pos 8
#define SWR_COLOR_RED_Pos 16
#define SWR_COLOR_ALPHA_Pos 24
#else
#define SWR_COLOR_RED_Pos 0
#define SWR_COLOR_GREEN_Pos 8
#define SWR_COLOR_BLUE_Pos 16
#define SWR_COLOR_ALPHA_Pos 24
#endif

// Channel masks. The literal is unsigned on purpose: shifting a signed 0xFF
// left by 24 moves a bit into the sign position, which is undefined behavior.
#define SWR_COLOR_RED_Msk (0xFFu << SWR_COLOR_RED_Pos)
#define SWR_COLOR_GREEN_Msk (0xFFu << SWR_COLOR_GREEN_Pos)
#define SWR_COLOR_BLUE_Msk (0xFFu << SWR_COLOR_BLUE_Pos)
#define SWR_COLOR_ALPHA_Msk (0xFFu << SWR_COLOR_ALPHA_Pos)

// Packs four channel values into one 32-bit pixel in the selected layout.
// Each argument is converted to uint32_t before shifting (same reason as the
// masks) and then masked, so only its low 8 bits end up in the result.
#define SWR_COLOR(r, g, b, a) (0u \
    | (((uint32_t)(r) << SWR_COLOR_RED_Pos) & SWR_COLOR_RED_Msk) \
    | (((uint32_t)(g) << SWR_COLOR_GREEN_Pos) & SWR_COLOR_GREEN_Msk) \
    | (((uint32_t)(b) << SWR_COLOR_BLUE_Pos) & SWR_COLOR_BLUE_Msk) \
    | (((uint32_t)(a) << SWR_COLOR_ALPHA_Pos) & SWR_COLOR_ALPHA_Msk))

// Fully opaque named colors.
#define SWR_COLOR_BLACK SWR_COLOR(0, 0, 0, 255)
#define SWR_COLOR_RED SWR_COLOR(255, 0, 0, 255)
#define SWR_COLOR_GREEN SWR_COLOR(0, 255, 0, 255)
#define SWR_COLOR_BLUE SWR_COLOR(0, 0, 255, 255)
#define SWR_COLOR_YELLOW SWR_COLOR(255, 255, 0, 255)
#define SWR_COLOR_WHITE SWR_COLOR(255, 255, 255, 255)
// 2D point/vector with single-precision components.
typedef struct vec2f
{
    float x; // horizontal component
    float y; // vertical component
} vec2f;
static const vec2f s_Points[] = { | |
{ .x = 0.00000000f, .y = 0.00000000f }, | |
{ .x = 0.00000000f, .y = 5.07142878f }, | |
{ .x = 5.07142878f, .y = 0.00000000f }, | |
{ .x = 0.00000000f, .y = -5.07142878f }, | |
{ .x = -5.07142830f, .y = 0.00000000f }, | |
{ .x = 3.88150382f, .y = 9.37077808f }, | |
{ .x = 9.37077904f, .y = 3.88150382f }, | |
{ .x = 9.37077904f, .y = -3.88150287f }, | |
{ .x = 3.88150501f, .y = -9.37077808f }, | |
{ .x = -3.88150501f, .y = -9.37077713f }, | |
{ .x = -9.37077713f, .y = -3.88150549f }, | |
{ .x = -9.37077904f, .y = 3.88150501f }, | |
{ .x = -3.88150144f, .y = 9.37077999f }, | |
{ .x = 0.00000000f, .y = 15.2142859f }, | |
{ .x = 7.60714293f, .y = 13.1759577f }, | |
{ .x = 13.1759586f, .y = 7.60714197f }, | |
{ .x = 15.2142859f, .y = 0.00000000f }, | |
{ .x = 13.1759577f, .y = -7.60714293f }, | |
{ .x = 7.60714102f, .y = -13.1759586f }, | |
{ .x = 0.00000000f, .y = -15.2142859f }, | |
{ .x = -7.60714293f, .y = -13.1759567f }, | |
{ .x = -13.1759567f, .y = -7.60714197f }, | |
{ .x = -15.2142849f, .y = 0.00000000f }, | |
{ .x = -13.1759567f, .y = 7.60714769f }, | |
{ .x = -7.60714197f, .y = 13.1759596f }, | |
{ .x = 3.95754576f, .y = 19.8959293f }, | |
{ .x = 11.2701397f, .y = 16.8669548f }, | |
{ .x = 16.8669548f, .y = 11.2701397f }, | |
{ .x = 19.8959293f, .y = 3.95754743f }, | |
{ .x = 19.8959293f, .y = -3.95754814f }, | |
{ .x = 16.8669567f, .y = -11.2701368f }, | |
{ .x = 11.2701387f, .y = -16.8669567f }, | |
{ .x = 3.95754361f, .y = -19.8959293f }, | |
{ .x = -3.95754337f, .y = -19.8959312f }, | |
{ .x = -11.2701368f, .y = -16.8669548f }, | |
{ .x = -16.8669567f, .y = -11.2701368f }, | |
{ .x = -19.8959293f, .y = -3.95754004f }, | |
{ .x = -19.8959312f, .y = 3.95754814f }, | |
{ .x = -16.8669548f, .y = 11.2701435f }, | |
{ .x = -11.2701445f, .y = 16.8669548f }, | |
{ .x = -3.95754981f, .y = 19.8959293f }, | |
{ .x = 0.00000000f, .y = 25.3571434f }, | |
{ .x = 7.83578825f, .y = 24.1160774f }, | |
{ .x = 14.9045563f, .y = 20.5143585f }, | |
{ .x = 20.5143604f, .y = 14.9045544f }, | |
{ .x = 24.1160755f, .y = 7.83578682f }, | |
{ .x = 25.3571434f, .y = 0.00000000f }, | |
{ .x = 24.1160774f, .y = -7.83578825f }, | |
{ .x = 20.5143585f, .y = -14.9045582f }, | |
{ .x = 14.9045544f, .y = -20.5143604f }, | |
{ .x = 7.83579159f, .y = -24.1160755f }, | |
{ .x = 0.00000000f, .y = -25.3571434f }, | |
{ .x = -7.83578539f, .y = -24.1160774f }, | |
{ .x = -14.9045534f, .y = -20.5143585f }, | |
{ .x = -20.5143604f, .y = -14.9045515f }, | |
{ .x = -24.1160793f, .y = -7.83578110f }, | |
{ .x = -25.3571415f, .y = 0.00000000f }, | |
{ .x = -24.1160774f, .y = 7.83579159f }, | |
{ .x = -20.5143585f, .y = 14.9045610f }, | |
{ .x = -14.9045620f, .y = 20.5143585f }, | |
{ .x = -7.83579302f, .y = 24.1160755f }, | |
{ .x = 0.00000000f, .y = 30.4285717f }, | |
{ .x = 7.87549543f, .y = 29.3917427f }, | |
{ .x = 15.2142859f, .y = 26.3519154f }, | |
{ .x = 21.5162487f, .y = 21.5162468f }, | |
{ .x = 26.3519173f, .y = 15.2142839f }, | |
{ .x = 29.3917446f, .y = 7.87549210f }, | |
{ .x = 30.4285717f, .y = 0.00000000f }, | |
{ .x = 29.3917427f, .y = -7.87549543f }, | |
{ .x = 26.3519154f, .y = -15.2142859f }, | |
{ .x = 21.5162506f, .y = -21.5162487f }, | |
{ .x = 15.2142820f, .y = -26.3519173f }, | |
{ .x = 7.87549210f, .y = -29.3917446f }, | |
{ .x = 0.00000000f, .y = -30.4285717f }, | |
{ .x = -7.87549925f, .y = -29.3917408f }, | |
{ .x = -15.2142859f, .y = -26.3519135f }, | |
{ .x = -21.5162487f, .y = -21.5162525f }, | |
{ .x = -26.3519135f, .y = -15.2142839f }, | |
{ .x = -29.3917408f, .y = -7.87549400f }, | |
{ .x = -30.4285698f, .y = 0.00000000f }, | |
{ .x = -29.3917408f, .y = 7.87550640f }, | |
{ .x = -26.3519135f, .y = 15.2142954f }, | |
{ .x = -21.5162582f, .y = 21.5162449f }, | |
{ .x = -15.2142839f, .y = 26.3519192f }, | |
{ .x = -7.87549400f, .y = 29.3917446f }, | |
{ .x = 0.00000000f, .y = 35.5000000f }, | |
{ .x = 7.38086557f, .y = 34.7242393f }, | |
{ .x = 14.4391518f, .y = 32.4308624f }, | |
{ .x = 20.8663788f, .y = 28.7201023f }, | |
{ .x = 26.3816433f, .y = 23.7541351f }, | |
{ .x = 30.7439041f, .y = 17.7499981f }, | |
{ .x = 33.7625046f, .y = 10.9701014f }, | |
{ .x = 35.3055267f, .y = 3.71075916f }, | |
{ .x = 35.3055267f, .y = -3.71076059f }, | |
{ .x = 33.7625084f, .y = -10.9701033f }, | |
{ .x = 30.7439022f, .y = -17.7500000f }, | |
{ .x = 26.3816414f, .y = -23.7541370f }, | |
{ .x = 20.8663769f, .y = -28.7201042f }, | |
{ .x = 14.4391499f, .y = -32.4308662f }, | |
{ .x = 7.38086414f, .y = -34.7242393f }, | |
{ .x = 0.00000000f, .y = -35.5000000f }, | |
{ .x = -7.38086557f, .y = -34.7242393f }, | |
{ .x = -14.4391594f, .y = -32.4308624f }, | |
{ .x = -20.8663750f, .y = -28.7201023f }, | |
{ .x = -26.3816452f, .y = -23.7541351f }, | |
{ .x = -30.7438984f, .y = -17.7499981f }, | |
{ .x = -33.7625084f, .y = -10.9700928f }, | |
{ .x = -33.8194656f, .y = -3.71075916f }, | |
{ .x = -33.8194656f, .y = 3.71075630f }, | |
{ .x = -33.7625084f, .y = 10.9701080f }, | |
{ .x = -30.7438984f, .y = 17.7500114f }, | |
{ .x = -26.3816433f, .y = 23.7541409f }, | |
{ .x = -20.8663864f, .y = 28.7201023f }, | |
{ .x = -14.4391537f, .y = 32.4308662f }, | |
{ .x = -7.38085985f, .y = 34.7242432f }, | |
}; | |
static const uint32_t kNumPoints = sizeof(s_Points) / sizeof(s_Points[0]); | |
static const uint32_t s_Colors[] = { | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
0xffff00fe, | |
0xffff000e, | |
0xff00ff2b, | |
0xff0000ff, | |
0xffff6c00, | |
}; | |
static const uint16_t s_Indices[] = { | |
107, 106, 78 , | |
107, 78 , 79 , | |
107, 79 , 108, | |
108, 79 , 80 , | |
108, 80 , 109, | |
106, 105, 77 , | |
106, 77 , 78 , | |
109, 80 , 81 , | |
109, 81 , 110, | |
105, 104, 77 , | |
110, 81 , 111, | |
79 , 78 , 56 , | |
79 , 56 , 80 , | |
78 , 77 , 55 , | |
78 , 55 , 56 , | |
80 , 56 , 57 , | |
80 , 57 , 81 , | |
104, 103, 76 , | |
104, 76 , 77 , | |
111, 81 , 82 , | |
111, 82 , 112, | |
77 , 76 , 54 , | |
77 , 54 , 55 , | |
81 , 57 , 58 , | |
81 , 58 , 82 , | |
56 , 55 , 36 , | |
56 , 36 , 37 , | |
56 , 37 , 57 , | |
55 , 54 , 35 , | |
55 , 35 , 36 , | |
57 , 37 , 38 , | |
57 , 38 , 58 , | |
82 , 58 , 59 , | |
82 , 59 , 83 , | |
82 , 83 , 112, | |
76 , 103, 75 , | |
76 , 75 , 53 , | |
76 , 53 , 54 , | |
112, 83 , 113, | |
103, 102, 75 , | |
54 , 53 , 35 , | |
58 , 38 , 39 , | |
58 , 39 , 59 , | |
37 , 36 , 22 , | |
37 , 22 , 23 , | |
37 , 23 , 38 , | |
36 , 35 , 21 , | |
36 , 21 , 22 , | |
35 , 53 , 34 , | |
35 , 34 , 21 , | |
38 , 23 , 39 , | |
75 , 102, 74 , | |
75 , 74 , 52 , | |
75 , 52 , 53 , | |
22 , 21 , 10 , | |
22 , 10 , 11 , | |
22 , 11 , 23 , | |
83 , 59 , 60 , | |
83 , 60 , 84 , | |
83 , 84 , 113, | |
59 , 39 , 60 , | |
53 , 52 , 34 , | |
102, 101, 74 , | |
113, 84 , 114, | |
21 , 34 , 20 , | |
21 , 20 , 10 , | |
23 , 11 , 12 , | |
23 , 12 , 24 , | |
23 , 24 , 39 , | |
39 , 24 , 40 , | |
39 , 40 , 60 , | |
34 , 52 , 33 , | |
34 , 33 , 20 , | |
11 , 10 , 4 , | |
11 , 4 , 1 , | |
11 , 1 , 12 , | |
10 , 20 , 9 , | |
10 , 9 , 3 , | |
10 , 3 , 4 , | |
74 , 101, 73 , | |
74 , 73 , 51 , | |
74 , 51 , 52 , | |
84 , 60 , 41 , | |
84 , 41 , 61 , | |
84 , 61 , 114, | |
60 , 40 , 41 , | |
52 , 51 , 33 , | |
20 , 33 , 19 , | |
20 , 19 , 9 , | |
24 , 12 , 13 , | |
24 , 13 , 40 , | |
101, 100, 73 , | |
114, 61 , 85 , | |
4 , 3 , 0 , | |
4 , 0 , 1 , | |
40 , 13 , 25 , | |
40 , 25 , 41 , | |
33 , 51 , 32 , | |
33 , 32 , 19 , | |
9 , 19 , 8 , | |
9 , 8 , 3 , | |
12 , 1 , 5 , | |
12 , 5 , 13 , | |
100, 99 , 73 , | |
73 , 99 , 72 , | |
73 , 72 , 51 , | |
51 , 72 , 50 , | |
51 , 50 , 32 , | |
19 , 32 , 18 , | |
19 , 18 , 8 , | |
3 , 8 , 2 , | |
3 , 2 , 0 , | |
0 , 2 , 1 , | |
1 , 2 , 5 , | |
13 , 5 , 14 , | |
13 , 14 , 25 , | |
41 , 25 , 42 , | |
41 , 42 , 62 , | |
41 , 62 , 61 , | |
61 , 62 , 86 , | |
61 , 86 , 85 , | |
5 , 2 , 6 , | |
5 , 6 , 14 , | |
8 , 18 , 17 , | |
8 , 17 , 7 , | |
8 , 7 , 2 , | |
32 , 50 , 31 , | |
32 , 31 , 18 , | |
25 , 14 , 26 , | |
25 , 26 , 42 , | |
2 , 7 , 6 , | |
99 , 98 , 72 , | |
86 , 62 , 87 , | |
18 , 31 , 17 , | |
14 , 6 , 15 , | |
14 , 15 , 27 , | |
14 , 27 , 26 , | |
42 , 26 , 43 , | |
42 , 43 , 63 , | |
42 , 63 , 62 , | |
50 , 72 , 71 , | |
50 , 71 , 49 , | |
50 , 49 , 31 , | |
72 , 98 , 71 , | |
62 , 63 , 87 , | |
7 , 17 , 16 , | |
7 , 16 , 6 , | |
6 , 16 , 15 , | |
31 , 49 , 48 , | |
31 , 48 , 30 , | |
31 , 30 , 17 , | |
26 , 27 , 43 , | |
17 , 30 , 29 , | |
17 , 29 , 16 , | |
15 , 16 , 28 , | |
15 , 28 , 27 , | |
98 , 97 , 71 , | |
87 , 63 , 88 , | |
49 , 71 , 70 , | |
49 , 70 , 48 , | |
43 , 27 , 44 , | |
43 , 44 , 64 , | |
43 , 64 , 63 , | |
71 , 97 , 70 , | |
16 , 29 , 28 , | |
63 , 64 , 88 , | |
27 , 28 , 45 , | |
27 , 45 , 44 , | |
30 , 48 , 47 , | |
30 , 47 , 29 , | |
29 , 47 , 46 , | |
29 , 46 , 28 , | |
28 , 46 , 45 , | |
48 , 70 , 69 , | |
48 , 69 , 47 , | |
44 , 45 , 65 , | |
44 , 65 , 64 , | |
97 , 96 , 70 , | |
88 , 64 , 89 , | |
64 , 65 , 89 , | |
70 , 96 , 69 , | |
45 , 46 , 66 , | |
45 , 66 , 65 , | |
47 , 69 , 68 , | |
47 , 68 , 46 , | |
46 , 68 , 67 , | |
46 , 67 , 66 , | |
69 , 96 , 95 , | |
69 , 95 , 94 , | |
69 , 94 , 68 , | |
65 , 66 , 91 , | |
65 , 91 , 90 , | |
65 , 90 , 89 , | |
68 , 94 , 93 , | |
68 , 93 , 67 , | |
66 , 67 , 92 , | |
66 , 92 , 91 , | |
67 , 93 ,92 , | |
}; | |
static const uint32_t kNumIndices = sizeof(s_Indices) / sizeof(s_Indices[0]); | |
// Software rasterizer state: a 32-bit-per-pixel framebuffer and its size.
typedef struct swr_context
{
    uint32_t* m_FrameBuffer; // m_Width * m_Height packed pixels, indexed as x + y * m_Width
    uint32_t m_Width;        // framebuffer width in pixels
    uint32_t m_Height;       // framebuffer height in pixels
} swr_context;
// Description of a fixed-size bitmap font.
typedef struct swr_font
{
    uint8_t* m_CharData;             // glyph bitmap bytes for all characters
    uint32_t m_CharWidth;            // glyph width in pixels
    uint32_t m_CharHeight;           // glyph height in pixels
    uint8_t m_CharMin;               // lowest character code covered by m_CharData
    uint8_t m_CharMax;               // highest character code covered by m_CharData
    uint8_t m_MissingCharFallbackID; // presumably the glyph substituted for out-of-range characters; confirm against swrDrawText
} swr_font;
// Built-in 8x8 font backed by font8x8_basic, covering character codes
// 0x00..0x7f. Defined through a file-scope compound literal, which has static
// storage duration, so the pointer stays valid for the whole program.
static const swr_font* s_Font8x8 = &(swr_font){
    .m_CharData = font8x8_basic,
    .m_CharWidth = 8,
    .m_CharHeight = 8,
    .m_CharMin = 0,
    .m_CharMax = 0x7f,
    .m_MissingCharFallbackID = 0
};
static swr_context* swrCreateContext(uint32_t w, uint32_t h); | |
static void swrDestroyContext(swr_context* ctx); | |
static void swrClear(swr_context* ctx, uint32_t color); | |
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color); | |
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color); | |
static void swrDrawTriangle(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_1(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_5(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_6(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_7(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_8(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_9(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_SSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawTriangle_SSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2); | |
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x, int32_t y, const char* str, const char* end, uint32_t color); | |
// Fixed-capacity sample store used for frame-time statistics.
typedef struct moving_averaged
{
    double m_Value[128];   // stored samples
    uint32_t m_NumItems;   // presumably the number of valid entries in m_Value; confirm against movAvgPush
    uint32_t m_NextItemID; // presumably the slot the next sample is written to; confirm against movAvgPush
} moving_averaged;
static void movAvgPush(moving_averaged* avg, double val); | |
static double movAvgGetAverage(const moving_averaged* avg); | |
static void movAvgGetMinMax(const moving_averaged* avg, double* tmin, double* tmax); | |
int32_t main(void) | |
{ | |
swr_context* ctx = swrCreateContext(FRAMEBUFFER_CONFIG_WIDTH, FRAMEBUFFER_CONFIG_HEIGHT); | |
if (!ctx) { | |
return -1; | |
} | |
struct mfb_window* window = mfb_open_ex("swr", ctx->m_Width, ctx->m_Height, 0u); | |
if (!window) { | |
swrDestroyContext(ctx); | |
return -2; | |
} | |
struct mfb_timer* timer = mfb_timer_create(); | |
// Calculate bounding rect of original mesh | |
vec2f bboxMin = s_Points[0]; | |
vec2f bboxMax = s_Points[0]; | |
for (uint32_t i = 1; i < kNumPoints; ++i) { | |
const vec2f* pt = &s_Points[i]; | |
bboxMin.x = pt->x < bboxMin.x ? pt->x : bboxMin.x; | |
bboxMin.y = pt->y < bboxMin.y ? pt->y : bboxMin.y; | |
bboxMax.x = pt->x > bboxMax.x ? pt->x : bboxMax.x; | |
bboxMax.y = pt->y > bboxMax.y ? pt->y : bboxMax.y; | |
} | |
// Transform all points to image space... | |
vec2f* transformedPoints = (vec2f*)malloc(sizeof(vec2f) * kNumPoints); | |
for (uint32_t i = 0; i < kNumPoints; ++i) { | |
const vec2f* pt = &s_Points[i]; | |
vec2f transPt; | |
transPt.y = (1.0f - ((pt->x - bboxMin.x) / (bboxMax.x - bboxMin.x))) * (ctx->m_Width - 64) + 32; | |
transPt.x = ((pt->y - bboxMin.y) / (bboxMax.y - bboxMin.y)) * (ctx->m_Height - 64) + 32; | |
transformedPoints[i] = transPt; | |
} | |
moving_averaged frameTimeAvg; | |
memset(&frameTimeAvg, 0, sizeof(moving_averaged)); | |
do { | |
mfb_timer_reset(timer); | |
{ | |
swrClear(ctx, SWR_COLOR_BLACK); | |
#if 1 | |
const uint32_t numTris = kNumIndices / 3; | |
for (uint32_t i = 0; i < numTris; ++i) { | |
const uint16_t id0 = s_Indices[i * 3 + 0]; | |
const uint16_t id1 = s_Indices[i * 3 + 1]; | |
const uint16_t id2 = s_Indices[i * 3 + 2]; | |
const vec2f* pt0 = &transformedPoints[id0]; | |
const vec2f* pt1 = &transformedPoints[id1]; | |
const vec2f* pt2 = &transformedPoints[id2]; | |
swrDrawTriangle_SSE41(ctx | |
, (int32_t)pt0->x, (int32_t)pt0->y | |
, (int32_t)pt1->x, (int32_t)pt1->y | |
, (int32_t)pt2->x, (int32_t)pt2->y | |
, s_Colors[id0] | |
, s_Colors[id1] | |
, s_Colors[id2] | |
); | |
} | |
#else | |
swrDrawTriangle(ctx, -10, 80, 200, -10, 300, 200, SWR_COLOR_RED, SWR_COLOR_GREEN, SWR_COLOR_BLUE); | |
#endif | |
} | |
const double dt = mfb_timer_delta(timer); | |
movAvgPush(&frameTimeAvg, dt * 1000.0); | |
{ | |
const double tAvg = movAvgGetAverage(&frameTimeAvg); | |
double tMin, tMax; | |
movAvgGetMinMax(&frameTimeAvg, &tMin, &tMax); | |
char str[256]; | |
sprintf(str, "Frame Time: %.2fms (avg: %.2fms, min: %.2fms, max: %.2fms)", dt * 1000.0, tAvg, tMin, tMax); | |
swrDrawText(ctx, s_Font8x8, 8, 8, str, NULL, SWR_COLOR_WHITE); | |
} | |
int32_t state = mfb_update_ex(window, ctx->m_FrameBuffer, ctx->m_Width, ctx->m_Height); | |
if (state < 0) { | |
window = NULL; | |
break; | |
} | |
} while (mfb_wait_sync(window)); | |
mfb_close(window); | |
swrDestroyContext(ctx); | |
return 0; | |
} | |
////////////////////////////////////////////////////////////////////////// | |
// SoftRast | |
// | |
static swr_context* swrCreateContext(uint32_t w, uint32_t h) | |
{ | |
swr_context* ctx = (swr_context*)malloc(sizeof(swr_context)); | |
if (!ctx) { | |
return NULL; | |
} | |
memset(ctx, 0, sizeof(swr_context)); | |
ctx->m_FrameBuffer = (uint32_t*)malloc(sizeof(uint32_t) * (size_t)w * (size_t)h); | |
if (!ctx->m_FrameBuffer) { | |
swrDestroyContext(ctx); | |
return NULL; | |
} | |
memset(ctx->m_FrameBuffer, 0, sizeof(uint32_t) * (size_t)w * (size_t)h); | |
ctx->m_Width = w; | |
ctx->m_Height = h; | |
return ctx; | |
} | |
// Frees a context created by swrCreateContext() together with its
// framebuffer. Passing NULL is a no-op.
static void swrDestroyContext(swr_context* ctx)
{
    if (!ctx) {
        return;
    }
    free(ctx->m_FrameBuffer);
    free(ctx);
}
// Fills every pixel of the framebuffer with `color`.
static void swrClear(swr_context* ctx, uint32_t color)
{
    uint32_t* buffer = ctx->m_FrameBuffer;
    // Count in size_t, as the allocation does, so width * height cannot wrap at 32 bits.
    const size_t numPixels = (size_t)ctx->m_Width * (size_t)ctx->m_Height;
    for (size_t i = 0; i < numPixels; ++i) {
        buffer[i] = color;
    }
}
// Writes `color` at (x, y). Coordinates outside the framebuffer are ignored.
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color)
{
    if (x < 0 || x >= (int32_t)ctx->m_Width || y < 0 || y >= (int32_t)ctx->m_Height) {
        return;
    }
    // x and y are known to be non-negative here; index in size_t so the
    // row offset cannot wrap at 32 bits.
    ctx->m_FrameBuffer[(size_t)y * ctx->m_Width + (size_t)x] = color;
}
// Draws a line from (x0, y0) to (x1, y1), both end points included, using an
// integer error accumulator. Pixels outside the framebuffer are dropped by
// swrDrawPixel().
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color)
{
    // Always step along the longer axis: for steep lines swap the roles of
    // x and y here and swap them back when plotting.
    bool steep = false;
    if (abs(x0 - x1) < abs(y0 - y1)) {
        int32_t tmp;
        tmp = x0; x0 = y0; y0 = tmp;
        tmp = x1; x1 = y1; y1 = tmp;
        steep = true;
    }

    // Make the stepping direction left-to-right.
    if (x0 > x1) {
        int32_t tmp;
        tmp = x0; x0 = x1; x1 = tmp;
        tmp = y0; y0 = y1; y1 = tmp;
    }

    const int32_t dx = x1 - x0;
    const int32_t derror2 = abs(y1 - y0) * 2; // error added per step, scaled by 2 * dx to stay in integers
    const int32_t yinc = y1 > y0 ? 1 : -1;

    int32_t error2 = 0;
    int32_t y = y0;

    // The steep test is hoisted out of the loops, hence the two near-identical bodies.
    if (steep) {
        for (int32_t x = x0; x <= x1; x++) {
            swrDrawPixel(ctx, y, x, color);
            error2 += derror2;
            if (error2 > dx) {
                y += yinc;
                error2 -= dx * 2;
            }
        }
    } else {
        for (int32_t x = x0; x <= x1; x++) {
            swrDrawPixel(ctx, x, y, color);
            error2 += derror2;
            if (error2 > dx) {
                y += yinc;
                error2 -= dx * 2;
            }
        }
    }
}
// Returns the smaller of a and b.
static inline int32_t swr_mini(int32_t a, int32_t b)
{
    return a < b ? a : b;
}
// Returns the larger of a and b.
static inline int32_t swr_maxi(int32_t a, int32_t b)
{
    return a > b ? a : b;
}
// Returns the smallest of a, b and c.
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c)
{
    const int32_t bc = b < c ? b : c;
    return a < bc ? a : bc;
}
// Returns the largest of a, b and c.
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c)
{
    const int32_t bc = b > c ? b : c;
    return a > bc ? a : bc;
}
// Computes the barycentric coordinates of point (x, y) with respect to the
// triangle (x0, y0), (x1, y1), (x2, y2).
//
// On return bc[0], bc[1] and bc[2] hold the weights of vertex 0, 1 and 2.
// Returns true when all three weights are non-negative, i.e. the point lies
// inside the triangle or on its boundary. Returns false for a degenerate
// (zero-area) triangle; in that case bc is left untouched.
static bool swr_calcBarycentricCoords(int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, int32_t x, int32_t y, float* bc)
{
    // Edge vectors from vertex 0.
    const float dx20 = (float)(x2 - x0);
    const float dx10 = (float)(x1 - x0);
    const float dy20 = (float)(y2 - y0);
    const float dy10 = (float)(y1 - y0);

    // Cross product of the two edges (twice the signed area). The inputs are
    // integers, so a magnitude below 1 means the area is zero.
    const float uz = dx20 * dy10 - dx10 * dy20;
    if (fabsf(uz) < 1.0f) {
        return false;
    }

    // Vector from the query point to vertex 0.
    const float dx0p = (float)(x0 - x);
    const float dy0p = (float)(y0 - y);

    const float ux = dx10 * dy0p - dx0p * dy10;
    const float uy = dx0p * dy20 - dx20 * dy0p;

    bc[0] = 1.0f - ((ux + uy) / uz);
    bc[1] = uy / uz;
    bc[2] = ux / uz;

    return bc[0] >= 0.0f && bc[1] >= 0.0f && bc[2] >= 0.0f;
}
// Scalar reference triangle rasterizer.
//
// Visits every pixel of the triangle's bounding box (clipped to the
// framebuffer), computes the pixel's barycentric coordinates with
// swr_calcBarycentricCoords() and, for pixels inside the triangle, blends the
// three vertex colors channel by channel using those coordinates as weights.
// The blended channel values are truncated to integers before being
// recombined with SWR_COLOR().
//
// color0/color1/color2 are the packed colors of vertices 0/1/2.
//
// avg: 8.82ms, min: 8.44ms, max: 9.28ms
static void swrDrawTriangle_1(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Bounding box of the triangle, clamped to the framebuffer extents.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);

    // Split each vertex color into its individual channels once, up front.
    const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;

    for (int32_t y = bboxMinY; y <= bboxMaxY; ++y) {
        for (int32_t x = bboxMinX; x <= bboxMaxX; ++x) {
            float barycentricCoords[3];
            if (!swr_calcBarycentricCoords(x0, y0, x1, y1, x2, y2, x, y, &barycentricCoords[0])) {
                // Pixel is outside the triangle (or the triangle is degenerate).
                continue;
            }

            const uint32_t cr = (uint32_t)(c0r * barycentricCoords[0] + c1r * barycentricCoords[1] + c2r * barycentricCoords[2]);
            const uint32_t cg = (uint32_t)(c0g * barycentricCoords[0] + c1g * barycentricCoords[1] + c2g * barycentricCoords[2]);
            const uint32_t cb = (uint32_t)(c0b * barycentricCoords[0] + c1b * barycentricCoords[1] + c2b * barycentricCoords[2]);
            const uint32_t ca = (uint32_t)(c0a * barycentricCoords[0] + c1a * barycentricCoords[1] + c2a * barycentricCoords[2]);

            swrDrawPixel(ctx, x, y, SWR_COLOR(cr, cg, cb, ca));
        }
    }
}
// Bounding-box triangle rasterizer with SSE color blending.
//
// Like swrDrawTriangle_1() it tests every pixel of the clipped bounding box
// with swr_calcBarycentricCoords(), but the three vertex colors are expanded
// once into four float lanes (one lane per byte of the packed color), blended
// with a handful of SSE multiplies/adds, converted back to integers, packed
// with saturation to 8 bits per channel and written straight into the
// framebuffer.
//
// avg: 4.95ms, min: 4.72ms, max: 5.24ms
static void swrDrawTriangle_2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Bounding box of the triangle, clamped to the framebuffer extents.
    const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);

    // Widen the 4 bytes of each packed color to 4 x int32 and then to 4 x float.
    const __m128i zero = _mm_setzero_si128();
    const __m128 vertexColor0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), zero), zero));
    const __m128 vertexColor1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), zero), zero));
    const __m128 vertexColor2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), zero), zero));

    for (int32_t py = minY; py <= maxY; ++py) {
        const uint32_t rowOffset = (uint32_t)py * ctx->m_Width;

        for (int32_t px = minX; px <= maxX; ++px) {
            float weights[3];
            if (swr_calcBarycentricCoords(x0, y0, x1, y1, x2, y2, px, py, &weights[0])) {
                // Weighted sum of the three vertex colors.
                const __m128 part0 = _mm_mul_ps(vertexColor0, _mm_set1_ps(weights[0]));
                const __m128 part1 = _mm_mul_ps(vertexColor1, _mm_set1_ps(weights[1]));
                const __m128 part2 = _mm_mul_ps(vertexColor2, _mm_set1_ps(weights[2]));
                const __m128 blended = _mm_add_ps(_mm_add_ps(part0, part1), part2);

                // float -> int32 -> int16 -> uint8 (with saturation), then store one pixel.
                const __m128i packed = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(blended), zero), zero);
                _mm_storeu_si32(&ctx->m_FrameBuffer[px + rowOffset], packed);
            }
        }
    }
}
// Bounding-box triangle rasterizer with the barycentric math inlined.
//
// Instead of calling swr_calcBarycentricCoords() per pixel, the edge deltas
// and the reciprocal of the edge cross product are computed once, the
// y-dependent products are computed once per row, and only the x-dependent
// part is evaluated per pixel. Pixels with any negative weight are skipped;
// the rest get the SSE-blended vertex color written to the framebuffer.
//
// avg: 2.84ms, min: 2.67ms, max: 3.04ms
static void swrDrawTriangle_3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Edge vectors leaving vertex 0 and their cross product.
    const int32_t edge2x = x2 - x0;
    const int32_t edge1x = x1 - x0;
    const int32_t edge2y = y2 - y0;
    const int32_t edge1y = y1 - y0;
    const int32_t cross = edge2x * edge1y - edge1x * edge2y;
    if (abs(cross) < 1) {
        // Zero-area triangle: nothing to draw.
        return;
    }

    // Bounding box of the triangle, clamped to the framebuffer extents.
    const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);

    // Widen the 4 bytes of each packed color to 4 x int32 and then to 4 x float.
    const __m128i zero = _mm_setzero_si128();
    const __m128 vertexColor0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), zero), zero));
    const __m128 vertexColor1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), zero), zero));
    const __m128 vertexColor2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), zero), zero));

    const float crossRcp = 1.0f / (float)cross;

    for (int32_t py = minY; py <= maxY; ++py) {
        // Row-invariant parts of the two unnormalized weights.
        const int32_t toV0y = y0 - py;
        const int32_t edge1x_toV0y = edge1x * toV0y;
        const int32_t edge2x_toV0y = edge2x * toV0y;

        uint32_t* row = &ctx->m_FrameBuffer[py * ctx->m_Width];

        for (int32_t px = minX; px <= maxX; ++px) {
            const int32_t toV0x = x0 - px;
            const int32_t num2 = edge1x_toV0y - toV0x * edge1y;
            const int32_t num1 = toV0x * edge2y - edge2x_toV0y;

            // Normalized weights of vertex 2, vertex 1 and vertex 0.
            const float w2 = (float)num2 * crossRcp;
            const float w1 = (float)num1 * crossRcp;
            const float w0 = 1.0f - (w2 + w1);

            const bool outside = (w0 < 0.0f) || (w1 < 0.0f) || (w2 < 0.0f);
            if (!outside) {
                const __m128 part0 = _mm_mul_ps(vertexColor0, _mm_set1_ps(w0));
                const __m128 part1 = _mm_mul_ps(vertexColor1, _mm_set1_ps(w1));
                const __m128 part2 = _mm_mul_ps(vertexColor2, _mm_set1_ps(w2));
                const __m128 blended = _mm_add_ps(_mm_add_ps(part0, part1), part2);

                // float -> int32 -> int16 -> uint8 (with saturation), then store one pixel.
                const __m128i packed = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(blended), zero), zero);
                _mm_storeu_si32(&row[px], packed);
            }
        }
    }
}
// Span-based triangle rasterizer.
//
// The unnormalized barycentric weights (ivx, ivy, ivz) are evaluated once at
// the top-left corner of the clipped bounding box and then updated
// incrementally: moving one pixel in +x changes them by (-dy01, -dy20,
// +dy01_dy20), moving one row in +y changes them by (+dx01, +dx20,
// -(dx01 + dx20)).
//
// For every row the range [ixmin, ixmax] of bounding-box-relative columns in
// which all three weights have the same sign as the triangle area is derived
// analytically from the row's starting weights, so the inner loop contains no
// inside/outside test (only an assert). Colors are blended with SSE exactly
// as in the earlier variants: weight z -> color0, y -> color1, x -> color2.
//
// avg: 1.66ms, min: 1.54ms, max: 1.99ms
static void swrDrawTriangle_4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    const int32_t dx20 = x2 - x0;
    const int32_t dx10 = x1 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy10 = y1 - y0;
    const int32_t iarea = dx20 * dy10 - dx10 * dy20;
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    }

    // Bounding box of the triangle, clamped to the framebuffer extents.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Triangle lies completely outside the framebuffer. Nothing would be
        // drawn anyway; returning here also avoids forming a framebuffer
        // pointer from out-of-range coordinates below.
        return;
    }
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Widen the 4 bytes of each packed color to 4 x int32 and then to 4 x float.
    const __m128i xmm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), xmm_zero), xmm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), xmm_zero), xmm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), xmm_zero), xmm_zero));

    const float inv_area = 1.0f / (float)iarea;

    const int32_t dy01 = -dy10;
    const int32_t dx01 = -dx10;
    const int32_t dy01_dy20 = dy01 + dy20;

    // Whether the per-pixel x-step of a weight moves it towards or away from
    // zero depends on the sign of the step relative to the sign of the area.
    const bool iarea_dy01_samesign = (iarea > 0 && dy01 > 0) || (iarea < 0 && dy01 < 0);
    const bool iarea_dy20_samesign = (iarea > 0 && dy20 > 0) || (iarea < 0 && dy20 < 0);

    // Unnormalized weights at the top-left corner of the bounding box.
    int32_t ivx = ((x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01);
    int32_t ivy = ((x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20);
    int32_t ivz = iarea - ivx - ivy;

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        //
        // A zero dy means the corresponding weight is constant along the row
        // and cannot limit the span; it must also be skipped because dividing
        // by it yields +/-infinity, whose conversion to int32_t is not
        // defined by the C standard.
        if (iarea_dy01_samesign) {
            ixmax = swr_mini(ixmax, (int32_t)floorf((float)ivx / (float)dy01));
        } else if (dy01 != 0 && ivx != 0) {
            ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)ivx / (float)dy01));
        }

        if (iarea_dy20_samesign) {
            ixmax = swr_mini(ixmax, (int32_t)floorf((float)ivy / (float)dy20));
        } else if (dy20 != 0 && ivy != 0) {
            ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)ivy / (float)dy20));
        }

        if ((iarea > 0 && dy01_dy20 < 0 && ivz >= 0) || (iarea < 0 && dy01_dy20 > 0 && ivz <= 0)) {
            ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)ivz / (float)dy01_dy20));
        } else if ((iarea > 0 && dy01_dy20 > 0 && ivz < 0) || (iarea < 0 && dy01_dy20 < 0 && ivz > 0)) {
            ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)ivz / (float)dy01_dy20));
        }

        // Weights at the first pixel of the span.
        int32_t iux = ivx - ixmin * dy01;
        int32_t iuy = ivy - ixmin * dy20;
        int32_t iuz = ivz + ixmin * dy01_dy20;

        for (int32_t ix = ixmin; ix <= ixmax; ++ix) {
            const float bcx = (float)iux * inv_area;
            const float bcy = (float)iuy * inv_area;
            const float bcz = (float)iuz * inv_area;
            assert(bcx >= 0.0f && bcy >= 0.0f && bcz >= 0.0f);

            const __m128 xmm_c0_scaled = _mm_mul_ps(xmm_c0, _mm_load1_ps(&bcz));
            const __m128 xmm_c1_scaled = _mm_mul_ps(xmm_c1, _mm_load1_ps(&bcy));
            const __m128 xmm_c2_scaled = _mm_mul_ps(xmm_c2, _mm_load1_ps(&bcx));
            const __m128 xmm_c = _mm_add_ps(_mm_add_ps(xmm_c0_scaled, xmm_c1_scaled), xmm_c2_scaled);

            // float -> int32 -> int16 -> uint8 (with saturation), then store one pixel.
            const __m128i imm_c = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c), xmm_zero), xmm_zero);
            _mm_storeu_si32(&framebufferRow[ix], imm_c);

            // Step one pixel in +x.
            iux -= dy01;
            iuy -= dy20;
            iuz += dy01_dy20;
        }

        // Step one row in +y.
        ivx += dx01;
        ivy += dx20;
        ivz -= dx01 + dx20;
        framebufferRow += ctx->m_Width;
    }
}
// Same as swrDrawTriangle_4() but with X and Y counters in XMM regs.
//
// The vertex order is normalized first (vertices 1 and 2, and their colors,
// are swapped when the signed area is negative) so that the area is always
// positive inside the loops; this removes the sign cases from the span
// computation. The three unnormalized barycentric weights live in lanes
// 0..2 of an integer XMM register and are stepped with a single vector add
// per pixel (imm_diu) and per row (imm_div).
//
// avg: 1.38ms, min: 1.29ms, max: 1.67ms
static void swrDrawTriangle_5(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    const int32_t dx20 = x2 - x0;
    const int32_t dx10 = x1 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy10 = y1 - y0;

    // Bounding box of the triangle, clamped to the framebuffer extents.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Triangle lies completely outside the framebuffer. Nothing would be
        // drawn anyway; returning here also avoids forming a framebuffer
        // pointer from out-of-range coordinates below.
        return;
    }
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Widen the 4 bytes of each packed color to 4 x int32 and then to 4 x float.
    const __m128i xmm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), xmm_zero), xmm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), xmm_zero), xmm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), xmm_zero), xmm_zero));

    const int32_t dy01 = -dy10;
    const int32_t dx01 = -dx10;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Reciprocals of the per-pixel weight steps. A step of zero has no usable
    // reciprocal; store 0 instead of dividing by zero. Those values are never
    // read because every use below is guarded by a non-zero test on the step.
    const float inv_dy01 = (dy01 != 0) ? 1.0f / (float)dy01 : 0.0f;
    const float inv_dy20 = (dy20 != 0) ? 1.0f / (float)dy20 : 0.0f;
    const float inv_dy01_dy20 = (dy01_dy20 != 0) ? 1.0f / (float)dy01_dy20 : 0.0f;

    const int32_t dx0min = x0 - bboxMinX;
    const int32_t dy0min = y0 - bboxMinY;

    // Per-row (+y) step of the three weights.
    const __m128i imm_div = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Unnormalized weights at the top-left corner of the bounding box.
    const int32_t iv0 = dx0min * dy01 - dy0min * dx01;
    const int32_t iv1 = dx0min * dy20 - dy0min * dx20;
    const int32_t iv2 = iarea - iv0 - iv1;
    __m128i imm_iv = _mm_set_epi32(0, iv2, iv1, iv0);

    // Per-pixel (+x) step of the three weights.
    const __m128i imm_diu = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t iv[4];
            _mm_storeu_si128((__m128i*)&iv[0], imm_iv);

            // A zero step means the weight is constant along the row and
            // cannot limit the span; it is skipped so that no infinite value
            // is ever converted to int32_t.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)iv[0] * inv_dy01));
            } else if (dy01 != 0 && iv[0] != 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)iv[0] * inv_dy01));
            }

            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)iv[1] * inv_dy20));
            } else if (dy20 != 0 && iv[1] != 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)iv[1] * inv_dy20));
            }

            if (dy01_dy20 < 0 && iv[2] >= 0) {
                ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)iv[2] * inv_dy01_dy20));
            } else if (dy01_dy20 > 0 && iv[2] < 0) {
                ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)iv[2] * inv_dy01_dy20));
            }
        }

        // Weights at the first pixel of the span.
        __m128i imm_iu = _mm_add_epi32(imm_iv, _mm_set_epi32(0, ixmin * dy01_dy20, -ixmin * dy20, -ixmin * dy01));

        for (int32_t ix = ixmin; ix <= ixmax; ++ix) {
            // No lane may have its sign bit set, i.e. all weights are >= 0.
            assert(_mm_movemask_ps(_mm_castsi128_ps(imm_iu)) == 0);

            const __m128 xmm_bc = _mm_mul_ps(_mm_cvtepi32_ps(imm_iu), xmm_inv_area);

            // Broadcast each normalized weight to all four lanes.
            const __m128 xmm_bc0 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(0, 0, 0, 0));
            const __m128 xmm_bc1 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(1, 1, 1, 1));
            const __m128 xmm_bc2 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(2, 2, 2, 2));

            const __m128 xmm_c0_scaled = _mm_mul_ps(xmm_c0, xmm_bc2);
            const __m128 xmm_c1_scaled = _mm_mul_ps(xmm_c1, xmm_bc1);
            const __m128 xmm_c2_scaled = _mm_mul_ps(xmm_c2, xmm_bc0);
            const __m128 xmm_c = _mm_add_ps(_mm_add_ps(xmm_c0_scaled, xmm_c1_scaled), xmm_c2_scaled);

            // float -> int32 -> int16 -> uint8 (with saturation), then store one pixel.
            const __m128i imm_c = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c), xmm_zero), xmm_zero);
            _mm_storeu_si32(&framebufferRow[ix], imm_c);

            // Step one pixel in +x.
            imm_iu = _mm_add_epi32(imm_iu, imm_diu);
        }

        // Step one row in +y.
        imm_iv = _mm_add_epi32(imm_iv, imm_div);
        framebufferRow += ctx->m_Width;
    }
}
// Integer-register flavour of _mm_shuffle_ps(): reinterprets both __m128i
// operands as packed floats, shuffles their 32-bit lanes according to the
// immediate selector (result lanes 0-1 come from the first operand, lanes 2-3
// from the second) and reinterprets the result as __m128i again.
#define _mm_shuffle_si128(lowSrc, highSrc, selector) \
    _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(lowSrc), _mm_castsi128_ps(highSrc), selector))
// SSE2-only replacement for the SSE4.1 _mm_mullo_epi32() intrinsic: multiplies
// the four 32-bit lanes of a and b pairwise and keeps the low 32 bits of each
// product.
//
// _mm_mul_epu32() only multiplies lanes 0 and 2 (producing 64-bit results), so
// it is applied twice: once to the inputs as they are (lanes 0 and 2) and once
// to the inputs shifted right by one lane (lanes 1 and 3). The low halves of
// the four products are then gathered and interleaved back into lane order.
static inline __m128i _mm_mullo_epi32_SSE2(const __m128i a, const __m128i b)
{
    const __m128i aOddLanes = _mm_srli_si128(a, 4);
    const __m128i bOddLanes = _mm_srli_si128(b, 4);

    const __m128i evenProducts = _mm_mul_epu32(a, b);
    const __m128i oddProducts = _mm_mul_epu32(aOddLanes, bOddLanes);

    // Move the low 32 bits of both 64-bit products into lanes 0 and 1.
    const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i oddLow = _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(0, 0, 2, 0));

    return _mm_unpacklo_epi32(evenLow, oddLow);
}
// Same as swrDrawTriangle_5() but the inner loop has been unrolled to 4 pixels
// per iteration (plus handling of remainder pixels).
//
// Only two normalized weights (u, v) are tracked per pixel, as floats. The
// pixel color is computed as c0 + (c2 - c0) * u + (c1 - c0) * v. The weights
// of the first pixel of every row span are derived from exact integer
// arithmetic; within the span they are advanced by adding precomputed float
// deltas.
//
// avg: 1.17ms, min: 1.10ms, max: 1.23ms
static void swrDrawTriangle_6(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box of the triangle, clamped to the framebuffer extents.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Triangle lies completely outside the framebuffer. Nothing would be
        // drawn anyway; returning here also avoids forming a framebuffer
        // pointer from out-of-range coordinates below.
        return;
    }
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Widen the 4 bytes of each packed color to 4 x int32 and then to 4 x float.
    const __m128i imm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0);
    const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0);

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));

    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // UV deltas for the next set of pixels
    const __m128 xmm_x_duv4_duv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 0, 1, 0));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Reciprocals of the per-pixel weight steps. A step of zero has no usable
    // reciprocal; store 0 instead of dividing by zero. Those values are never
    // read because every use below is guarded by a non-zero test on the step.
    const float inv_dy01_dy20 = (dy01_dy20 != 0) ? 1.0f / (float)dy01_dy20 : 0.0f;
    const float inv_dy20 = (dy20 != 0) ? 1.0f / (float)dy20 : 0.0f;
    const float inv_dy01 = (dy01 != 0) ? 1.0f / (float)dy01 : 0.0f;

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            // A zero step means the weight is constant along the row and
            // cannot limit the span; it is skipped so that no infinite value
            // is ever converted to int32_t.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[0] * inv_dy01));
            } else if (dy01 != 0 && row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[0] * inv_dy01));
            }

            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[1] * inv_dy20));
            } else if (dy20 != 0 && row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[1] * inv_dy20));
            }

            if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
                ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)row_uvw_[2] * inv_dy01_dy20));
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)row_uvw_[2] * inv_dy01_dy20));
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);

            // (u, v, u, v) of the first pixel of the current group.
            __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate barycentric coordinates for the 4 pixels.
                const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
                const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

                // Extract barycentric coordinates for each pixel
                const __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2));
                const __m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3));
                const __m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2));
                const __m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3));

                // Calculate color of each pixel
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v)));
                const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v)));
                const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v)));

                // float -> int32 -> int16 -> uint8 (with saturation); each result holds one packed pixel in lane 0.
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                const __m128i imm_c_p1 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p1), imm_zero), imm_zero);
                const __m128i imm_c_p2 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p2), imm_zero), imm_zero);
                const __m128i imm_c_p3 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p3), imm_zero), imm_zero);

                // Pack the 4 colors into a XMM registers and store into framebuffer
                const __m128i imm_c_p0011 = _mm_shuffle_si128(imm_c_p0, imm_c_p1, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p2233 = _mm_shuffle_si128(imm_c_p2, imm_c_p3, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p0123 = _mm_shuffle_si128(imm_c_p0011, imm_c_p2233, _MM_SHUFFLE(2, 0, 2, 0));
                _mm_storeu_si128((__m128i*)frameBuffer, imm_c_p0123);

                // Move on to the next set of pixels
                xmm_p0uvuv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv4_duv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row
            //
            // Only lanes 0 and 1 (u, v) of xmm_p0uvuv are read below, so adding
            // the full (du, dv, dw, 0) delta to it is harmless.
            const uint32_t rem = numPixels & 3;
            switch (rem) {
            case 3: {
                const __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0uvuv = _mm_add_ps(xmm_p0uvuv, xmm_x_duvw_1);
                frameBuffer++;
            } // fallthrough
            case 2: {
                const __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0uvuv = _mm_add_ps(xmm_p0uvuv, xmm_x_duvw_1);
                frameBuffer++;
            } // fallthrough
            case 1: {
                const __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0uvuv = _mm_add_ps(xmm_p0uvuv, xmm_x_duvw_1);
            } // fallthrough
            case 0:
            default:
                break;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Same as swrDrawTriangle_6() but the inner loop increments the | |
// final 4 pixel barycentric coords (8 regs) directly. | |
// | |
// avg: 1.11ms, min: 1.06ms, max: 1.21ms | |
static void swrDrawTriangle_7(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2) | |
{ | |
int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0); | |
if (iarea == 0) { | |
// Degenerate triangle with 0 area. | |
return; | |
} else if (iarea < 0) { | |
// Swap (x1, y1) <-> (x2, y2) | |
{ int32_t tmp = x1; x1 = x2; x2 = tmp; } | |
{ int32_t tmp = y1; y1 = y2; y2 = tmp; } | |
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; } | |
iarea = -iarea; | |
} | |
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0); | |
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0); | |
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1); | |
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1); | |
const int32_t bboxWidth = bboxMaxX - bboxMinX; | |
const int32_t bboxHeight = bboxMaxY - bboxMinY; | |
const __m128i imm_zero = _mm_setzero_si128(); | |
const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero)); | |
const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero)); | |
const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero)); | |
const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0); | |
const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0); | |
const int32_t dy01 = y0 - y1; | |
const int32_t dx01 = x0 - x1; | |
const int32_t dx20 = x2 - x0; | |
const int32_t dy20 = y2 - y0; | |
const int32_t dy01_dy20 = dy01 + dy20; | |
const __m128 xmm_zero = _mm_setzero_ps(); | |
const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea); | |
// Barycentric coordinate deltas for the X direction | |
const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01); | |
const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area); | |
const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1); | |
const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2); | |
const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2); | |
// UV deltas for the 1st and 2nd pixel | |
const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0)); | |
// UV deltas for the 3rd and 4th pixel | |
const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0)); | |
const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1)); | |
// Barycentric coordinate deltas for the Y direction | |
const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01); | |
// Calculate unnormalized barycentric coordinates of the bounding box min. | |
const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01; | |
const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20; | |
const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v; | |
__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u); | |
// | |
const float inv_dy01_dy20 = 1.0f / (float)dy01_dy20; | |
const float inv_dy20 = 1.0f / (float)dy20; | |
const float inv_dy01 = 1.0f / (float)dy01; | |
uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width]; | |
for (int32_t iy = 0; iy <= bboxHeight; ++iy) { | |
int32_t ixmin = 0; | |
int32_t ixmax = (uint32_t)bboxWidth; | |
// Calculate ixmin and ixmax | |
{ | |
int32_t row_uvw_[4]; | |
_mm_storeu_si128((__m128i*)& row_uvw_[0], imm_row_uvw_); | |
if (dy01 > 0) { | |
ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[0] * inv_dy01)); | |
} else if (row_uvw_[0] != 0) { | |
ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[0] * inv_dy01)); | |
} | |
if (dy20 > 0) { | |
ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[1] * inv_dy20)); | |
} else if (row_uvw_[1] != 0) { | |
ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[1] * inv_dy20)); | |
} | |
if (dy01_dy20 < 0 && row_uvw_[2] >= 0) { | |
ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)row_uvw_[2] * inv_dy01_dy20)); | |
} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) { | |
ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)row_uvw_[2] * inv_dy01_dy20)); | |
} | |
} | |
if (ixmin <= ixmax) { | |
// Calculate normalized barycentric coordinates at ixmin of the current row of pixels. | |
const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_)); | |
const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area); | |
const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0)); | |
// Calculate barycentric coordinates for the 4 pixels. | |
const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels | |
const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels | |
// Extract barycentric coordinates for each pixel | |
__m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0)); | |
__m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1)); | |
__m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2)); | |
__m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3)); | |
__m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0)); | |
__m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1)); | |
__m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2)); | |
__m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3)); | |
uint32_t* frameBuffer = &framebufferRow[ixmin]; | |
const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1); | |
const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration | |
for (uint32_t iIter = 0; iIter < numIter; ++iIter) { | |
// Calculate color of each pixel | |
const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v))); | |
const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v))); | |
const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v))); | |
const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v))); | |
const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero); | |
const __m128i imm_c_p1 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p1), imm_zero), imm_zero); | |
const __m128i imm_c_p2 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p2), imm_zero), imm_zero); | |
const __m128i imm_c_p3 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p3), imm_zero), imm_zero); | |
// Pack the 4 colors into a XMM registers and store into framebuffer | |
const __m128i imm_c_p0011 = _mm_shuffle_si128(imm_c_p0, imm_c_p1, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128i imm_c_p2233 = _mm_shuffle_si128(imm_c_p2, imm_c_p3, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128i imm_c_p0123 = _mm_shuffle_si128(imm_c_p0011, imm_c_p2233, _MM_SHUFFLE(2, 0, 2, 0)); | |
_mm_storeu_si128((__m128i*)frameBuffer, imm_c_p0123); | |
// Move on to the next set of pixels | |
xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du4); | |
xmm_p1u = _mm_add_ps(xmm_p1u, xmm_x_du4); | |
xmm_p2u = _mm_add_ps(xmm_p2u, xmm_x_du4); | |
xmm_p3u = _mm_add_ps(xmm_p3u, xmm_x_du4); | |
xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv4); | |
xmm_p1v = _mm_add_ps(xmm_p1v, xmm_x_dv4); | |
xmm_p2v = _mm_add_ps(xmm_p2v, xmm_x_dv4); | |
xmm_p3v = _mm_add_ps(xmm_p3v, xmm_x_dv4); | |
frameBuffer += 4; | |
} | |
// Handle the remainder of pixels for this row | |
const uint32_t rem = numPixels & 3; | |
switch (rem) { | |
case 3: { | |
const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v))); | |
const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero); | |
_mm_storeu_si32(frameBuffer, imm_c_p0); | |
xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du); | |
xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv); | |
frameBuffer++; | |
} // fallthrough | |
case 2: { | |
const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v))); | |
const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero); | |
_mm_storeu_si32(frameBuffer, imm_c_p0); | |
xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du); | |
xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv); | |
frameBuffer++; | |
} // fallthrough | |
case 1: { | |
const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v))); | |
const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero); | |
_mm_storeu_si32(frameBuffer, imm_c_p0); | |
xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du); | |
xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv); | |
} // fallthrough | |
case 0: | |
default: | |
break; | |
} | |
} | |
// Move on to the next row of pixels. | |
imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_); | |
framebufferRow += ctx->m_Width; | |
} | |
} | |
// http://dss.stephanierct.com/DevBlog/?p=8
// Not referenced by _mm_floor_ps_SSE2() below; retained because other code in this file may
// still read it. It has no 16-byte alignment guarantee, so never load it with _mm_load_ps().
static const float xmm_ones[] = { 1.0f, 1.0f, 1.0f, 1.0f };

// SSE2-only floor(): rounds each of the 4 lanes of x toward negative infinity.
//
// The value is truncated through a float -> int32 -> float round trip and then corrected by -1
// in the lanes where truncation rounded up (negative non-integers).
// Lanes with |x| >= 2^23 are already integral and are returned unchanged; this also keeps
// +/-Inf and NaN intact instead of letting the int32 conversion overflow.
static inline __m128 _mm_floor_ps_SSE2(__m128 x)
{
    // Built in a register instead of being loaded from memory: the previous aligned load from
    // xmm_ones relied on an alignment the array is not guaranteed to have.
    const __m128 one = _mm_set1_ps(1.0f);

    // Round toward zero.
    const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));

    // truncated > x only happens for negative non-integers; subtract 1 in exactly those lanes.
    const __m128 correction = _mm_and_ps(_mm_cmpgt_ps(truncated, x), one);
    const __m128 floored = _mm_sub_ps(truncated, correction);

    // Select the computed value only where |x| < 2^23. The comparison is false for NaN, so NaN
    // lanes fall through to the pass-through side together with large values and infinities.
    const __m128 abs_x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
    const __m128 in_range = _mm_cmplt_ps(abs_x, _mm_set1_ps(8388608.0f));
    return _mm_or_ps(_mm_and_ps(in_range, floored), _mm_andnot_ps(in_range, x));
}
// SSE2-only ceil(): rounds each of the 4 lanes of x toward positive infinity.
//
// The value is truncated through a float -> int32 -> float round trip and then corrected by +1
// in the lanes where truncation rounded down (positive non-integers).
// Lanes with |x| >= 2^23 are already integral and are returned unchanged; this also keeps
// +/-Inf and NaN intact instead of letting the int32 conversion overflow.
// Note: inputs in (-1, 0) produce +0.0f rather than -0.0f.
static inline __m128 _mm_ceil_ps_SSE2(__m128 x)
{
    // Built in a register instead of an aligned load from a float array whose 16-byte alignment
    // is not guaranteed.
    const __m128 one = _mm_set1_ps(1.0f);

    // Round toward zero.
    const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));

    // truncated < x only happens for positive non-integers; add 1 in exactly those lanes.
    const __m128 correction = _mm_and_ps(_mm_cmplt_ps(truncated, x), one);
    const __m128 ceiled = _mm_add_ps(truncated, correction);

    // Select the computed value only where |x| < 2^23. The comparison is false for NaN, so NaN
    // lanes fall through to the pass-through side together with large values and infinities.
    const __m128 abs_x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
    const __m128 in_range = _mm_cmplt_ps(abs_x, _mm_set1_ps(8388608.0f));
    return _mm_or_ps(_mm_and_ps(in_range, ceiled), _mm_andnot_ps(in_range, x));
}
// Same as swrDrawTriangle_7() but with the floorf() and ceilf() calls replaced by
// _mm_floor_ps_SSE2() and _mm_ceil_ps_SSE2().
//
// avg: 1.03ms, min: 0.98ms, max: 1.13ms
//
// Fills the triangle (x0, y0), (x1, y1), (x2, y2) in ctx->m_FrameBuffer. The three vertex colors
// are unpacked byte by byte into 4 float lanes and interpolated with barycentric weights.
// Triangles with zero area are ignored; triangles with negative area get vertices 1 and 2
// swapped so that the area is always positive. The bounding box is clamped to
// [0, m_Width - 1] x [0, m_Height - 1]; nothing is drawn if the clamped box is empty.
//
// For every row of the bounding box the covered span [ixmin, ixmax] (relative to bboxMinX) is
// derived from the three edge functions u, v, w at the left end of the row, then filled
// 4 pixels at a time with a scalar tail.
static void swrDrawTriangle_8(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;
    if (bboxWidth < 0 || bboxHeight < 0) {
        // The triangle lies completely outside the framebuffer. Bail out before forming a
        // framebuffer pointer from an out-of-range (bboxMinX, bboxMinY).
        return;
    }

    // Vertex colors as 4 float lanes (one lane per byte of the packed color).
    const __m128i imm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0);
    const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0);

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // Per-pixel (du, dv) and per-4-pixels (du4, dv4) steps broadcast to all lanes.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Reciprocals used to turn a row's (u, v, w) into the x offsets where each edge is crossed.
    // NOTE: a horizontal edge makes its denominator 0, so that lane becomes Inf/NaN. The u and v
    // lanes can then only reach the ixmin update below, where (on x86) the float -> int
    // conversion yields INT32_MIN and swr_maxi() ignores it. The w lane is not read in that case.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_)));
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_)));

            // Edge u: changes by -dy01 per pixel.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // Edge v: changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // Edge w: changes by +dy01_dy20 per pixel.
            if (row_uvw_[2] >= 0) {
                if (dy01_dy20 < 0) {
                    ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
                }
            } else if (dy01_dy20 > 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            } else {
                // w is already negative at the left end of the row and does not grow along +x,
                // so no pixel of this row is inside. This happens when the bounding box has been
                // clamped at the left screen edge; without this the whole row would be filled.
                ixmax = -1;
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // 3rd and 4th pixels

            // Broadcast each pixel's u and v to all lanes so they can scale the color deltas.
            __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate color of each pixel
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v)));
                const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v)));
                const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v)));

                // Saturate every pixel down to 4 bytes in the low DWORD of its own register.
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                const __m128i imm_c_p1 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p1), imm_zero), imm_zero);
                const __m128i imm_c_p2 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p2), imm_zero), imm_zero);
                const __m128i imm_c_p3 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p3), imm_zero), imm_zero);

                // Pack the 4 colors into a single XMM register and store into the framebuffer
                const __m128i imm_c_p0011 = _mm_shuffle_si128(imm_c_p0, imm_c_p1, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p2233 = _mm_shuffle_si128(imm_c_p2, imm_c_p3, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p0123 = _mm_shuffle_si128(imm_c_p0011, imm_c_p2233, _MM_SHUFFLE(2, 0, 2, 0));
                _mm_storeu_si128((__m128i*)frameBuffer, imm_c_p0123);

                // Move on to the next set of pixels
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du4);
                xmm_p1u = _mm_add_ps(xmm_p1u, xmm_x_du4);
                xmm_p2u = _mm_add_ps(xmm_p2u, xmm_x_du4);
                xmm_p3u = _mm_add_ps(xmm_p3u, xmm_x_du4);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv4);
                xmm_p1v = _mm_add_ps(xmm_p1v, xmm_x_dv4);
                xmm_p2v = _mm_add_ps(xmm_p2v, xmm_x_dv4);
                xmm_p3v = _mm_add_ps(xmm_p3v, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3 pixels). At this point
            // xmm_p0u/xmm_p0v hold the coordinates of the first pixel not yet written.
            for (uint32_t iRem = numPixels & 3; iRem != 0; --iRem) {
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Similar to swrDrawTriangle_8() except for the way the 4 pixel colors are packed into 4 DWORDs:
// the four per-pixel results are narrowed together with two _mm_packs_epi32() and one
// _mm_packus_epi16() instead of being narrowed separately and shuffled together.
//
// avg: 0.90ms, min: 0.86ms, max: 1.00ms
//
// Fills the triangle (x0, y0), (x1, y1), (x2, y2) in ctx->m_FrameBuffer. The three vertex colors
// are unpacked byte by byte into 4 float lanes and interpolated with barycentric weights.
// Triangles with zero area are ignored; triangles with negative area get vertices 1 and 2
// swapped so that the area is always positive. The bounding box is clamped to
// [0, m_Width - 1] x [0, m_Height - 1]; nothing is drawn if the clamped box is empty.
static void swrDrawTriangle_9(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;
    if (bboxWidth < 0 || bboxHeight < 0) {
        // The triangle lies completely outside the framebuffer. Bail out before forming a
        // framebuffer pointer from an out-of-range (bboxMinX, bboxMinY).
        return;
    }

    // Vertex colors as 4 float lanes (one lane per byte of the packed color).
    const __m128i imm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0);
    const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0);

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // Per-pixel (du, dv) and per-4-pixels (du4, dv4) steps broadcast to all lanes.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Reciprocals used to turn a row's (u, v, w) into the x offsets where each edge is crossed.
    // NOTE: a horizontal edge makes its denominator 0, so that lane becomes Inf/NaN. The u and v
    // lanes can then only reach the ixmin update below, where (on x86) the float -> int
    // conversion yields INT32_MIN and swr_maxi() ignores it. The w lane is not read in that case.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_)));
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_)));

            // Edge u: changes by -dy01 per pixel.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // Edge v: changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // Edge w: changes by +dy01_dy20 per pixel.
            if (row_uvw_[2] >= 0) {
                if (dy01_dy20 < 0) {
                    ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
                }
            } else if (dy01_dy20 > 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            } else {
                // w is already negative at the left end of the row and does not grow along +x,
                // so no pixel of this row is inside. This happens when the bounding box has been
                // clamped at the left screen edge; without this the whole row would be filled.
                ixmax = -1;
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // 3rd and 4th pixels

            // Broadcast each pixel's u and v to all lanes so they can scale the color deltas.
            __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate the color of each pixel
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v)));
                const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v)));
                const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v)));

                // 32-bit integers -> saturated 16-bit (two pixels per register)
                const __m128i imm_rgba_p01_u16 = _mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), _mm_cvtps_epi32(xmm_c_p1));
                const __m128i imm_rgba_p23_u16 = _mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p2), _mm_cvtps_epi32(xmm_c_p3));

                // 16-bit -> saturated 8-bit (all four pixels in one register), then store
                const __m128i imm_rgba_p0123_u8 = _mm_packus_epi16(imm_rgba_p01_u16, imm_rgba_p23_u16);
                _mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

                // Move on to the next set of pixels
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du4);
                xmm_p1u = _mm_add_ps(xmm_p1u, xmm_x_du4);
                xmm_p2u = _mm_add_ps(xmm_p2u, xmm_x_du4);
                xmm_p3u = _mm_add_ps(xmm_p3u, xmm_x_du4);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv4);
                xmm_p1v = _mm_add_ps(xmm_p1v, xmm_x_dv4);
                xmm_p2v = _mm_add_ps(xmm_p2v, xmm_x_dv4);
                xmm_p3v = _mm_add_ps(xmm_p3v, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3 pixels). At this point
            // xmm_p0u/xmm_p0v hold the coordinates of the first pixel not yet written.
            for (uint32_t iRem = numPixels & 3; iRem != 0; --iRem) {
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Similar to swrDrawTriangle_9() but each of the RGBA channels is treated as a separate attribute | |
// to be interpolated over the triangle. | |
// | |
// avg: 0.94ms, min: 0.90ms, max: 1.01ms | |
static void swrDrawTriangle(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2) | |
{ | |
int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0); | |
if (iarea == 0) { | |
// Degenerate triangle with 0 area. | |
return; | |
} else if (iarea < 0) { | |
// Swap (x1, y1) <-> (x2, y2) | |
{ int32_t tmp = x1; x1 = x2; x2 = tmp; } | |
{ int32_t tmp = y1; y1 = y2; y2 = tmp; } | |
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; } | |
iarea = -iarea; | |
} | |
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0); | |
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0); | |
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1); | |
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1); | |
const int32_t bboxWidth = bboxMaxX - bboxMinX; | |
const int32_t bboxHeight = bboxMaxY - bboxMinY; | |
const __m128i imm_zero = _mm_setzero_si128(); | |
const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero)); | |
const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero)); | |
const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero)); | |
const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0); | |
const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0); | |
const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2)); | |
const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3)); | |
const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2)); | |
const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3)); | |
const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2)); | |
const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3)); | |
const int32_t dy01 = y0 - y1; | |
const int32_t dx01 = x0 - x1; | |
const int32_t dx20 = x2 - x0; | |
const int32_t dy20 = y2 - y0; | |
const int32_t dy01_dy20 = dy01 + dy20; | |
const __m128 xmm_zero = _mm_setzero_ps(); | |
const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea); | |
// Barycentric coordinate deltas for the X direction | |
const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01); | |
const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area); | |
const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1); | |
const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2); | |
const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2); | |
// UV deltas for the 1st and 2nd pixel | |
const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0)); | |
// UV deltas for the 3rd and 4th pixel | |
const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0)); | |
const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1)); | |
// Barycentric coordinate deltas for the Y direction | |
const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01); | |
// Calculate unnormalized barycentric coordinates of the bounding box min. | |
const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01; | |
const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20; | |
const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v; | |
__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u); | |
// | |
const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01); | |
uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width]; | |
for (int32_t iy = 0; iy <= bboxHeight; ++iy) { | |
int32_t ixmin = 0; | |
int32_t ixmax = (uint32_t)bboxWidth; | |
// Calculate ixmin and ixmax | |
{ | |
int32_t row_uvw_[4]; | |
_mm_storeu_si128((__m128i*) & row_uvw_[0], imm_row_uvw_); | |
const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale); | |
const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_)); | |
const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_)); | |
int32_t row_uvw_floor[4]; | |
_mm_storeu_si128((__m128i*) & row_uvw_floor[0], imm_row_uvw_floor); | |
int32_t row_uvw_ceil[4]; | |
_mm_storeu_si128((__m128i*) & row_uvw_ceil[0], imm_row_uvw_ceil); | |
if (dy01 > 0) { | |
ixmax = swr_mini(ixmax, row_uvw_floor[0]); | |
} else if (row_uvw_[0] != 0) { | |
ixmin = swr_maxi(ixmin, row_uvw_ceil[0]); | |
} | |
if (dy20 > 0) { | |
ixmax = swr_mini(ixmax, row_uvw_floor[1]); | |
} else if (row_uvw_[1] != 0) { | |
ixmin = swr_maxi(ixmin, row_uvw_ceil[1]); | |
} | |
if (dy01_dy20 < 0 && row_uvw_[2] >= 0) { | |
ixmax = swr_mini(ixmax, -row_uvw_ceil[2]); | |
} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) { | |
ixmin = swr_maxi(ixmin, -row_uvw_floor[2]); | |
} | |
} | |
if (ixmin <= ixmax) { | |
// Calculate normalized barycentric coordinates at ixmin of the current row of pixels. | |
const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_)); | |
const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area); | |
const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0)); | |
// Calculate barycentric coordinates for the 4 pixels. | |
const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels | |
const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels | |
// Extract barycentric coordinates for each pixel | |
__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0)); | |
__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1)); | |
uint32_t* frameBuffer = &framebufferRow[ixmin]; | |
const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1); | |
const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration | |
for (uint32_t iIter = 0; iIter < numIter; ++iIter) { | |
// Calculate the color of each pixel | |
const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123))); | |
const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123))); | |
const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123))); | |
const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123))); | |
// Pack into uint8_t | |
// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 } | |
const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16( | |
_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)), | |
_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123)) | |
); | |
// https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register | |
// _mm_shuffle_epi8() with SSE2 | |
__m128i mask = _mm_set_epi8(0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF); | |
// (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 } | |
const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 = | |
_mm_packus_epi16( | |
_mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, mask), | |
_mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8) | |
); | |
// (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b3, a2, r3, g3, b3, a3 } | |
const __m128i imm_rgba_p0123_u8 = | |
_mm_packus_epi16( | |
_mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, mask), | |
_mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8) | |
); | |
// Store | |
_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8); | |
// Move on to the next set of pixels | |
xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4); | |
xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4); | |
frameBuffer += 4; | |
} | |
// Handle the remainder of pixels for this row | |
const uint32_t rem = numPixels & 3; | |
switch (rem) { | |
case 3: { | |
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123))); | |
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123))); | |
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123))); | |
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123))); | |
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0)); | |
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero); | |
_mm_storeu_si32(frameBuffer, imm_rgba_p0); | |
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du); | |
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv); | |
frameBuffer++; | |
} // fallthrough | |
case 2: { | |
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123))); | |
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123))); | |
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123))); | |
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123))); | |
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0)); | |
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero); | |
_mm_storeu_si32(frameBuffer, imm_rgba_p0); | |
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du); | |
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv); | |
frameBuffer++; | |
} // fallthrough | |
case 1: { | |
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123))); | |
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123))); | |
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123))); | |
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123))); | |
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0)); | |
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero); | |
_mm_storeu_si32(frameBuffer, imm_rgba_p0); | |
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du); | |
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv); | |
} // fallthrough | |
case 0: | |
default: | |
break; | |
} | |
} | |
// Move on to the next row of pixels. | |
imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_); | |
framebufferRow += ctx->m_Width; | |
} | |
} | |
// Same as swrDrawTriangle() but with SSSE3 _mm_shuffle_epi8
//
// Fills the triangle (x0, y0), (x1, y1), (x2, y2) in ctx->m_FrameBuffer, linearly
// interpolating the four 8-bit channels of color0/color1/color2 across it.
// The triangle is clipped to [0, m_Width - 1] x [0, m_Height - 1]; zero-area
// triangles and triangles that fall completely outside that rectangle draw nothing.
// For every row of the clipped bounding box the covered span [ixmin, ixmax] is
// computed first, then the span is shaded 4 pixels at a time with a scalar tail.
//
// avg: 0.85ms, min: 0.80ms, max: 0.91ms (timing note from the original source)
static void swrDrawTriangle_SSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Twice the signed area of the triangle; its sign tells the vertex order.
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2) so the area is positive from here on.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box of the triangle, clamped to the framebuffer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;
    if (bboxWidth < 0 || bboxHeight < 0) {
        // The triangle lies completely outside the framebuffer. Nothing would be
        // drawn anyway; returning here avoids forming a row pointer from an
        // out-of-range bounding box corner.
        return;
    }

    const __m128i imm_zero = _mm_setzero_si128();

    // Expand each packed color into 4 floats, one per byte (lowest byte in lane 0).
    const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
    const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);

    // Broadcast every channel of the base color and of the two color deltas.
    const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // Per-pixel and per-4-pixel steps of u and v, broadcast to all lanes.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Scale that turns each row-start coordinate into the x offset at which that
    // coordinate reaches zero (coordinate divided by its per-pixel step magnitude).
    // NOTE(review): a horizontal edge makes the matching denominator zero, so that
    // lane is non-finite; confirm the floor/ceil helpers tolerate it.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    // Byte shuffle that turns planar { r0..r3, g0..g3, b0..b3, a0..a3 } into
    // interleaved { r0, g0, b0, a0, r1, g1, b1, a1, ... }. It never changes, so it
    // is built once here instead of once per group of 4 pixels.
    const __m128i imm_planar_to_rgba = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        // Span of covered pixels on this row, relative to bboxMinX.
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
            const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

            // u changes by -dy01 per pixel: a decreasing u limits the right end of
            // the span, an increasing u limits the left end.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // Same for v, which changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // w changes by +(dy01 + dy20) per pixel, hence the negated bounds.
            if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
                ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

            // Extract barycentric coordinates for each pixel
            __m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
            __m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate the color of each pixel
                const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

                // Pack into uint8_t
                // (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
                const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
                );

                // Shuffle into RGBA uint32_t
                const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_planar_to_rgba);

                // Store
                _mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

                // Move on to the next set of pixels
                xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
                xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3 pixels), one at a
            // time. Only lane 0 of u/v is read and advanced here.
            for (uint32_t iRem = numPixels & 3; iRem != 0; --iRem) {
                const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));

                // Gather the four channel results into one { r, g, b, a } vector.
                const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));

                const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_rgba_p0);

                xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
                xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Same as swrDrawTriangle() but uses SSE 4.1 instructions for floorf(), ceilf() and mullo_epi32()
// (and, like the SSSE3 variant, _mm_shuffle_epi8 for the final byte interleave).
//
// Fills the triangle (x0, y0), (x1, y1), (x2, y2) in ctx->m_FrameBuffer, linearly
// interpolating the four 8-bit channels of color0/color1/color2 across it.
// The triangle is clipped to [0, m_Width - 1] x [0, m_Height - 1]; zero-area
// triangles and triangles that fall completely outside that rectangle draw nothing.
// For every row of the clipped bounding box the covered span [ixmin, ixmax] is
// computed first, then the span is shaded 4 pixels at a time with a scalar tail.
//
// avg: 0.82ms, min: 0.78ms, max: 0.94ms (timing note from the original source)
static void swrDrawTriangle_SSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Twice the signed area of the triangle; its sign tells the vertex order.
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2) so the area is positive from here on.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box of the triangle, clamped to the framebuffer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;
    if (bboxWidth < 0 || bboxHeight < 0) {
        // The triangle lies completely outside the framebuffer. Nothing would be
        // drawn anyway; returning here avoids forming a row pointer from an
        // out-of-range bounding box corner.
        return;
    }

    const __m128i imm_zero = _mm_setzero_si128();

    // Expand each packed color into 4 floats, one per byte (lowest byte in lane 0).
    const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
    const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);

    // Broadcast every channel of the base color and of the two color deltas.
    const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // Per-pixel and per-4-pixel steps of u and v, broadcast to all lanes.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Scale that turns each row-start coordinate into the x offset at which that
    // coordinate reaches zero (coordinate divided by its per-pixel step magnitude).
    // NOTE(review): a horizontal edge makes the matching denominator zero, so that
    // lane is non-finite; the branches below either skip it or clamp against it.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    // Byte shuffle that turns planar { r0..r3, g0..g3, b0..b3, a0..a3 } into
    // interleaved { r0, g0, b0, a0, r1, g1, b1, a1, ... }. It never changes, so it
    // is built once here instead of once per group of 4 pixels.
    const __m128i imm_planar_to_rgba = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        // Span of covered pixels on this row, relative to bboxMinX.
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps(xmm_row_uvw_));
            const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps(xmm_row_uvw_));

            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

            // u changes by -dy01 per pixel: a decreasing u limits the right end of
            // the span, an increasing u limits the left end.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // Same for v, which changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // w changes by +(dy01 + dy20) per pixel, hence the negated bounds.
            if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
                ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

            // Extract barycentric coordinates for each pixel
            __m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
            __m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate the color of each pixel
                const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

                // Pack into uint8_t
                // (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
                const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
                );

                // Shuffle into RGBA uint32_t
                const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, imm_planar_to_rgba);

                // Store
                _mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

                // Move on to the next set of pixels
                xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
                xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3 pixels), one at a
            // time. Only lane 0 of u/v is read and advanced here.
            for (uint32_t iRem = numPixels & 3; iRem != 0; --iRem) {
                const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));

                // Gather the four channel results into one { r, g, b, a } vector.
                const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));

                const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_rgba_p0);

                xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
                xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Draws the characters in [str, end) on a single line, left to right, starting
// with the top-left corner of the first glyph at (x0, y0).
//
// ctx   - render target; forwarded unchanged to swrDrawPixel().
// font  - bitmap font. Each glyph is read as m_CharHeight consecutive bytes of
//         m_CharData, one byte per row, where bit N of a row is the pixel at
//         column N. Characters outside [m_CharMin, m_CharMax] are replaced by
//         m_MissingCharFallbackID.
// str   - first character to draw. NULL draws nothing.
// end   - one past the last character to draw, or NULL to draw up to the
//         terminating NUL of str.
// color - value passed to swrDrawPixel() for every set glyph bit.
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color)
{
    if (str == NULL) {
        // Nothing to draw; also keeps strlen() from being called on NULL.
        return;
    }
    if (end == NULL) {
        end = str + strlen(str);
    }

    const int32_t chw = (int32_t)font->m_CharWidth;
    const int32_t chh = (int32_t)font->m_CharHeight;
    const uint8_t* chdata = font->m_CharData;

    int32_t x = x0;
    // '<' instead of '!=' so that an 'end' that precedes 'str' draws nothing
    // instead of running past the string.
    for (; str < end; ++str) {
        // Read the byte as unsigned so that values >= 0x80 are range-checked the
        // same way whether plain char is signed or unsigned on this platform.
        unsigned char ch = (unsigned char)*str;
        if (ch < font->m_CharMin || ch > font->m_CharMax) {
            ch = (unsigned char)font->m_MissingCharFallbackID;
        }

        // Index of the glyph inside the font table.
        const uint8_t chID = (uint8_t)(ch - font->m_CharMin);
        const uint8_t* charData = &chdata[chID * chh];

        for (int32_t chy = 0; chy < chh; ++chy) {
            const uint8_t chrow = charData[chy];
            for (int32_t chx = 0; chx < chw; ++chx) {
                if ((chrow & (1u << chx)) != 0) {
                    swrDrawPixel(ctx, x + chx, y0 + chy, color);
                }
            }
        }

        // Advance the pen by one fixed-width cell.
        x += chw;
    }
}
// Appends one sample to the moving-average ring buffer.
//
// The sample is written at avg->m_NextItemID, the write position then advances
// and wraps at the buffer capacity, and avg->m_NumItems grows until it reaches
// that capacity. Once full, each push overwrites the oldest sample.
//
// The write position must wrap at the capacity, not at the current item count:
// wrapping at the count made the second push overwrite slot 0 and left
// m_NumItems counting one slot that had never been written until the buffer
// filled up, which skewed the average and min/max during warm-up.
static void movAvgPush(moving_averaged* avg, double val)
{
    // Same limit the item count has always been clamped to.
    // NOTE(review): assumes avg->m_Value holds at least 128 entries - confirm against the struct.
    const uint32_t capacity = 128;

    const uint32_t id = avg->m_NextItemID;
    avg->m_Value[id] = val;
    avg->m_NextItemID = (id + 1) % capacity;

    if (avg->m_NumItems < capacity) {
        avg->m_NumItems++;
    }
}
static double movAvgGetAverage(const moving_averaged* avg) | |
{ | |
const uint32_t n = avg->m_NumItems; | |
double sum = 0.0; | |
for (uint32_t i = 0; i < n; ++i) { | |
sum += avg->m_Value[i]; | |
} | |
return sum / (double)n; | |
} | |
static void movAvgGetMinMax(const moving_averaged* avg, double* tmin, double* tmax) | |
{ | |
const uint32_t n = avg->m_NumItems; | |
double minT = avg->m_Value[0]; | |
double maxT = avg->m_Value[0]; | |
for (uint32_t i = 1; i < n; ++i) { | |
minT = avg->m_Value[i] < minT ? avg->m_Value[i] : minT; | |
maxT = avg->m_Value[i] > maxT ? avg->m_Value[i] : maxT; | |
} | |
*tmin = minT; | |
*tmax = maxT; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment