Skip to content

Instantly share code, notes, and snippets.

@jdryg
Created December 16, 2022 08:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdryg/725e408b948ed1d0c02f675e825c7e97 to your computer and use it in GitHub Desktop.
Save jdryg/725e408b948ed1d0c02f675e825c7e97 to your computer and use it in GitHub Desktop.
Triangle rasterization experiments
/**
* 8x8 monochrome bitmap fonts for rendering
* Author: Daniel Hepper <daniel@hepper.net>
*
* License: Public Domain
*
* Based on:
* // Summary: font8x8.h
* // 8x8 monochrome bitmap fonts for rendering
* //
* // Author:
* // Marcel Sondaar
* // International Business Machines (public domain VGA fonts)
* //
* // License:
* // Public Domain
*
* Fetched from: http://dimensionalrift.homelinux.net/combuster/mos3/?p=viewsource&file=/modules/gfx/font8_8.asm
**/
#include <stdint.h>
// Constant: font8x8_basic
// 8x8 bitmap glyphs for unicode points U+0000 - U+007F (basic latin).
// Stored as one flat array: every glyph occupies 8 consecutive bytes, so the glyph
// for code point c starts at index c * 8 (128 glyphs, 1024 bytes in total).
// Code points U+0000 - U+001F and U+007F are blank (all zero).
// NOTE(review): presumably one byte per pixel row with bit 0 as the leftmost pixel -
// confirm against the text renderer that consumes this table.
uint8_t font8x8_basic[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0000 (nul)
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0001
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0002
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0003
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0004
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0005
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0006
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0007
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0008
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0009
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000A
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000B
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000C
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000D
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000E
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+000F
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0010
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0011
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0012
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0013
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0014
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0015
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0016
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0017
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0018
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0019
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001A
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001B
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001C
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001D
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001E
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+001F
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0020 (space)
0x18, 0x3C, 0x3C, 0x18, 0x18, 0x00, 0x18, 0x00, // U+0021 (!)
0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0022 (")
0x36, 0x36, 0x7F, 0x36, 0x7F, 0x36, 0x36, 0x00, // U+0023 (#)
0x0C, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x0C, 0x00, // U+0024 ($)
0x00, 0x63, 0x33, 0x18, 0x0C, 0x66, 0x63, 0x00, // U+0025 (%)
0x1C, 0x36, 0x1C, 0x6E, 0x3B, 0x33, 0x6E, 0x00, // U+0026 (&)
0x06, 0x06, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0027 (')
0x18, 0x0C, 0x06, 0x06, 0x06, 0x0C, 0x18, 0x00, // U+0028 (()
0x06, 0x0C, 0x18, 0x18, 0x18, 0x0C, 0x06, 0x00, // U+0029 ())
0x00, 0x66, 0x3C, 0xFF, 0x3C, 0x66, 0x00, 0x00, // U+002A (*)
0x00, 0x0C, 0x0C, 0x3F, 0x0C, 0x0C, 0x00, 0x00, // U+002B (+)
0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x06, // U+002C (,)
0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, // U+002D (-)
0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x00, // U+002E (.)
0x60, 0x30, 0x18, 0x0C, 0x06, 0x03, 0x01, 0x00, // U+002F (/)
0x3E, 0x63, 0x73, 0x7B, 0x6F, 0x67, 0x3E, 0x00, // U+0030 (0)
0x0C, 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x3F, 0x00, // U+0031 (1)
0x1E, 0x33, 0x30, 0x1C, 0x06, 0x33, 0x3F, 0x00, // U+0032 (2)
0x1E, 0x33, 0x30, 0x1C, 0x30, 0x33, 0x1E, 0x00, // U+0033 (3)
0x38, 0x3C, 0x36, 0x33, 0x7F, 0x30, 0x78, 0x00, // U+0034 (4)
0x3F, 0x03, 0x1F, 0x30, 0x30, 0x33, 0x1E, 0x00, // U+0035 (5)
0x1C, 0x06, 0x03, 0x1F, 0x33, 0x33, 0x1E, 0x00, // U+0036 (6)
0x3F, 0x33, 0x30, 0x18, 0x0C, 0x0C, 0x0C, 0x00, // U+0037 (7)
0x1E, 0x33, 0x33, 0x1E, 0x33, 0x33, 0x1E, 0x00, // U+0038 (8)
0x1E, 0x33, 0x33, 0x3E, 0x30, 0x18, 0x0E, 0x00, // U+0039 (9)
0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x00, // U+003A (:)
0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x06, // U+003B (;)
0x18, 0x0C, 0x06, 0x03, 0x06, 0x0C, 0x18, 0x00, // U+003C (<)
0x00, 0x00, 0x3F, 0x00, 0x00, 0x3F, 0x00, 0x00, // U+003D (=)
0x06, 0x0C, 0x18, 0x30, 0x18, 0x0C, 0x06, 0x00, // U+003E (>)
0x1E, 0x33, 0x30, 0x18, 0x0C, 0x00, 0x0C, 0x00, // U+003F (?)
0x3E, 0x63, 0x7B, 0x7B, 0x7B, 0x03, 0x1E, 0x00, // U+0040 (@)
0x0C, 0x1E, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x00, // U+0041 (A)
0x3F, 0x66, 0x66, 0x3E, 0x66, 0x66, 0x3F, 0x00, // U+0042 (B)
0x3C, 0x66, 0x03, 0x03, 0x03, 0x66, 0x3C, 0x00, // U+0043 (C)
0x1F, 0x36, 0x66, 0x66, 0x66, 0x36, 0x1F, 0x00, // U+0044 (D)
0x7F, 0x46, 0x16, 0x1E, 0x16, 0x46, 0x7F, 0x00, // U+0045 (E)
0x7F, 0x46, 0x16, 0x1E, 0x16, 0x06, 0x0F, 0x00, // U+0046 (F)
0x3C, 0x66, 0x03, 0x03, 0x73, 0x66, 0x7C, 0x00, // U+0047 (G)
0x33, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x33, 0x00, // U+0048 (H)
0x1E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+0049 (I)
0x78, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E, 0x00, // U+004A (J)
0x67, 0x66, 0x36, 0x1E, 0x36, 0x66, 0x67, 0x00, // U+004B (K)
0x0F, 0x06, 0x06, 0x06, 0x46, 0x66, 0x7F, 0x00, // U+004C (L)
0x63, 0x77, 0x7F, 0x7F, 0x6B, 0x63, 0x63, 0x00, // U+004D (M)
0x63, 0x67, 0x6F, 0x7B, 0x73, 0x63, 0x63, 0x00, // U+004E (N)
0x1C, 0x36, 0x63, 0x63, 0x63, 0x36, 0x1C, 0x00, // U+004F (O)
0x3F, 0x66, 0x66, 0x3E, 0x06, 0x06, 0x0F, 0x00, // U+0050 (P)
0x1E, 0x33, 0x33, 0x33, 0x3B, 0x1E, 0x38, 0x00, // U+0051 (Q)
0x3F, 0x66, 0x66, 0x3E, 0x36, 0x66, 0x67, 0x00, // U+0052 (R)
0x1E, 0x33, 0x07, 0x0E, 0x38, 0x33, 0x1E, 0x00, // U+0053 (S)
0x3F, 0x2D, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+0054 (T)
0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x3F, 0x00, // U+0055 (U)
0x33, 0x33, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00, // U+0056 (V)
0x63, 0x63, 0x63, 0x6B, 0x7F, 0x77, 0x63, 0x00, // U+0057 (W)
0x63, 0x63, 0x36, 0x1C, 0x1C, 0x36, 0x63, 0x00, // U+0058 (X)
0x33, 0x33, 0x33, 0x1E, 0x0C, 0x0C, 0x1E, 0x00, // U+0059 (Y)
0x7F, 0x63, 0x31, 0x18, 0x4C, 0x66, 0x7F, 0x00, // U+005A (Z)
0x1E, 0x06, 0x06, 0x06, 0x06, 0x06, 0x1E, 0x00, // U+005B ([)
0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0x40, 0x00, // U+005C (\)
0x1E, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1E, 0x00, // U+005D (])
0x08, 0x1C, 0x36, 0x63, 0x00, 0x00, 0x00, 0x00, // U+005E (^)
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, // U+005F (_)
0x0C, 0x0C, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, // U+0060 (`)
0x00, 0x00, 0x1E, 0x30, 0x3E, 0x33, 0x6E, 0x00, // U+0061 (a)
0x07, 0x06, 0x06, 0x3E, 0x66, 0x66, 0x3B, 0x00, // U+0062 (b)
0x00, 0x00, 0x1E, 0x33, 0x03, 0x33, 0x1E, 0x00, // U+0063 (c)
0x38, 0x30, 0x30, 0x3e, 0x33, 0x33, 0x6E, 0x00, // U+0064 (d)
0x00, 0x00, 0x1E, 0x33, 0x3f, 0x03, 0x1E, 0x00, // U+0065 (e)
0x1C, 0x36, 0x06, 0x0f, 0x06, 0x06, 0x0F, 0x00, // U+0066 (f)
0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x1F, // U+0067 (g)
0x07, 0x06, 0x36, 0x6E, 0x66, 0x66, 0x67, 0x00, // U+0068 (h)
0x0C, 0x00, 0x0E, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+0069 (i)
0x30, 0x00, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E, // U+006A (j)
0x07, 0x06, 0x66, 0x36, 0x1E, 0x36, 0x67, 0x00, // U+006B (k)
0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00, // U+006C (l)
0x00, 0x00, 0x33, 0x7F, 0x7F, 0x6B, 0x63, 0x00, // U+006D (m)
0x00, 0x00, 0x1F, 0x33, 0x33, 0x33, 0x33, 0x00, // U+006E (n)
0x00, 0x00, 0x1E, 0x33, 0x33, 0x33, 0x1E, 0x00, // U+006F (o)
0x00, 0x00, 0x3B, 0x66, 0x66, 0x3E, 0x06, 0x0F, // U+0070 (p)
0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x78, // U+0071 (q)
0x00, 0x00, 0x3B, 0x6E, 0x66, 0x06, 0x0F, 0x00, // U+0072 (r)
0x00, 0x00, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x00, // U+0073 (s)
0x08, 0x0C, 0x3E, 0x0C, 0x0C, 0x2C, 0x18, 0x00, // U+0074 (t)
0x00, 0x00, 0x33, 0x33, 0x33, 0x33, 0x6E, 0x00, // U+0075 (u)
0x00, 0x00, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00, // U+0076 (v)
0x00, 0x00, 0x63, 0x6B, 0x7F, 0x7F, 0x36, 0x00, // U+0077 (w)
0x00, 0x00, 0x63, 0x36, 0x1C, 0x36, 0x63, 0x00, // U+0078 (x)
0x00, 0x00, 0x33, 0x33, 0x33, 0x3E, 0x30, 0x1F, // U+0079 (y)
0x00, 0x00, 0x3F, 0x19, 0x0C, 0x26, 0x3F, 0x00, // U+007A (z)
0x38, 0x0C, 0x0C, 0x07, 0x0C, 0x0C, 0x38, 0x00, // U+007B ({)
0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00, // U+007C (|)
0x07, 0x0C, 0x0C, 0x38, 0x0C, 0x0C, 0x07, 0x00, // U+007D (})
0x6E, 0x3B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // U+007E (~)
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 // U+007F
};
#define _CRT_SECURE_NO_WARNINGS
#include "minifb/MiniFB.h"
#include "font8x8_basic.h"
#include <stdint.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <immintrin.h>
#include <assert.h>
// Framebuffer dimensions, in pixels, passed to swrCreateContext() by main().
#define FRAMEBUFFER_CONFIG_WIDTH 1024
#define FRAMEBUFFER_CONFIG_HEIGHT 1024
// Channel orderings a packed 32-bit pixel can use.
#define SWR_COLOR_FORMAT_RGBA 0
#define SWR_COLOR_FORMAT_BGRA 1
// The framebuffer format can be overridden from the build command line;
// it defaults to BGRA.
#ifndef SWR_FRAMEBUFFER_FORMAT
#define SWR_FRAMEBUFFER_FORMAT SWR_COLOR_FORMAT_BGRA
#endif
// Bit position of every channel inside a packed pixel for the selected format.
#if SWR_FRAMEBUFFER_FORMAT == SWR_COLOR_FORMAT_BGRA
#define SWR_COLOR_BLUE_Pos 0
#define SWR_COLOR_GREEN_Pos 8
#define SWR_COLOR_RED_Pos 16
#define SWR_COLOR_ALPHA_Pos 24
#else
#define SWR_COLOR_RED_Pos 0
#define SWR_COLOR_GREEN_Pos 8
#define SWR_COLOR_BLUE_Pos 16
#define SWR_COLOR_ALPHA_Pos 24
#endif
// 8-bit channel masks. The operand is an unsigned 32-bit value on purpose:
// shifting the plain int 0xFF left by 24 would move a bit into the sign bit,
// which is undefined behaviour for a signed type.
#define SWR_COLOR_RED_Msk ((uint32_t)0xFFu << SWR_COLOR_RED_Pos)
#define SWR_COLOR_GREEN_Msk ((uint32_t)0xFFu << SWR_COLOR_GREEN_Pos)
#define SWR_COLOR_BLUE_Msk ((uint32_t)0xFFu << SWR_COLOR_BLUE_Pos)
#define SWR_COLOR_ALPHA_Msk ((uint32_t)0xFFu << SWR_COLOR_ALPHA_Pos)
// Packs four channel values into one uint32_t pixel in the framebuffer format.
// Every argument is converted to uint32_t before shifting (same reason as the
// masks above); only the low 8 bits of each channel survive the masking.
#define SWR_COLOR(r, g, b, a) ((uint32_t)0 \
| (((uint32_t)(r) << SWR_COLOR_RED_Pos) & SWR_COLOR_RED_Msk) \
| (((uint32_t)(g) << SWR_COLOR_GREEN_Pos) & SWR_COLOR_GREEN_Msk) \
| (((uint32_t)(b) << SWR_COLOR_BLUE_Pos) & SWR_COLOR_BLUE_Msk) \
| (((uint32_t)(a) << SWR_COLOR_ALPHA_Pos) & SWR_COLOR_ALPHA_Msk))
// Fully opaque named colors.
#define SWR_COLOR_BLACK SWR_COLOR(0, 0, 0, 255)
#define SWR_COLOR_RED SWR_COLOR(255, 0, 0, 255)
#define SWR_COLOR_GREEN SWR_COLOR(0, 255, 0, 255)
#define SWR_COLOR_BLUE SWR_COLOR(0, 0, 255, 255)
#define SWR_COLOR_YELLOW SWR_COLOR(255, 255, 0, 255)
#define SWR_COLOR_WHITE SWR_COLOR(255, 255, 255, 255)
// 2D point/vector with single-precision components.
typedef struct vec2f
{
float x; // horizontal component
float y; // vertical component
} vec2f;
// Vertex positions of the test mesh, in the mesh's own coordinate space.
// main() computes their bounding box and remaps them into framebuffer coordinates;
// s_Indices refers to these entries by position.
static const vec2f s_Points[] = {
{ .x = 0.00000000f, .y = 0.00000000f },
{ .x = 0.00000000f, .y = 5.07142878f },
{ .x = 5.07142878f, .y = 0.00000000f },
{ .x = 0.00000000f, .y = -5.07142878f },
{ .x = -5.07142830f, .y = 0.00000000f },
{ .x = 3.88150382f, .y = 9.37077808f },
{ .x = 9.37077904f, .y = 3.88150382f },
{ .x = 9.37077904f, .y = -3.88150287f },
{ .x = 3.88150501f, .y = -9.37077808f },
{ .x = -3.88150501f, .y = -9.37077713f },
{ .x = -9.37077713f, .y = -3.88150549f },
{ .x = -9.37077904f, .y = 3.88150501f },
{ .x = -3.88150144f, .y = 9.37077999f },
{ .x = 0.00000000f, .y = 15.2142859f },
{ .x = 7.60714293f, .y = 13.1759577f },
{ .x = 13.1759586f, .y = 7.60714197f },
{ .x = 15.2142859f, .y = 0.00000000f },
{ .x = 13.1759577f, .y = -7.60714293f },
{ .x = 7.60714102f, .y = -13.1759586f },
{ .x = 0.00000000f, .y = -15.2142859f },
{ .x = -7.60714293f, .y = -13.1759567f },
{ .x = -13.1759567f, .y = -7.60714197f },
{ .x = -15.2142849f, .y = 0.00000000f },
{ .x = -13.1759567f, .y = 7.60714769f },
{ .x = -7.60714197f, .y = 13.1759596f },
{ .x = 3.95754576f, .y = 19.8959293f },
{ .x = 11.2701397f, .y = 16.8669548f },
{ .x = 16.8669548f, .y = 11.2701397f },
{ .x = 19.8959293f, .y = 3.95754743f },
{ .x = 19.8959293f, .y = -3.95754814f },
{ .x = 16.8669567f, .y = -11.2701368f },
{ .x = 11.2701387f, .y = -16.8669567f },
{ .x = 3.95754361f, .y = -19.8959293f },
{ .x = -3.95754337f, .y = -19.8959312f },
{ .x = -11.2701368f, .y = -16.8669548f },
{ .x = -16.8669567f, .y = -11.2701368f },
{ .x = -19.8959293f, .y = -3.95754004f },
{ .x = -19.8959312f, .y = 3.95754814f },
{ .x = -16.8669548f, .y = 11.2701435f },
{ .x = -11.2701445f, .y = 16.8669548f },
{ .x = -3.95754981f, .y = 19.8959293f },
{ .x = 0.00000000f, .y = 25.3571434f },
{ .x = 7.83578825f, .y = 24.1160774f },
{ .x = 14.9045563f, .y = 20.5143585f },
{ .x = 20.5143604f, .y = 14.9045544f },
{ .x = 24.1160755f, .y = 7.83578682f },
{ .x = 25.3571434f, .y = 0.00000000f },
{ .x = 24.1160774f, .y = -7.83578825f },
{ .x = 20.5143585f, .y = -14.9045582f },
{ .x = 14.9045544f, .y = -20.5143604f },
{ .x = 7.83579159f, .y = -24.1160755f },
{ .x = 0.00000000f, .y = -25.3571434f },
{ .x = -7.83578539f, .y = -24.1160774f },
{ .x = -14.9045534f, .y = -20.5143585f },
{ .x = -20.5143604f, .y = -14.9045515f },
{ .x = -24.1160793f, .y = -7.83578110f },
{ .x = -25.3571415f, .y = 0.00000000f },
{ .x = -24.1160774f, .y = 7.83579159f },
{ .x = -20.5143585f, .y = 14.9045610f },
{ .x = -14.9045620f, .y = 20.5143585f },
{ .x = -7.83579302f, .y = 24.1160755f },
{ .x = 0.00000000f, .y = 30.4285717f },
{ .x = 7.87549543f, .y = 29.3917427f },
{ .x = 15.2142859f, .y = 26.3519154f },
{ .x = 21.5162487f, .y = 21.5162468f },
{ .x = 26.3519173f, .y = 15.2142839f },
{ .x = 29.3917446f, .y = 7.87549210f },
{ .x = 30.4285717f, .y = 0.00000000f },
{ .x = 29.3917427f, .y = -7.87549543f },
{ .x = 26.3519154f, .y = -15.2142859f },
{ .x = 21.5162506f, .y = -21.5162487f },
{ .x = 15.2142820f, .y = -26.3519173f },
{ .x = 7.87549210f, .y = -29.3917446f },
{ .x = 0.00000000f, .y = -30.4285717f },
{ .x = -7.87549925f, .y = -29.3917408f },
{ .x = -15.2142859f, .y = -26.3519135f },
{ .x = -21.5162487f, .y = -21.5162525f },
{ .x = -26.3519135f, .y = -15.2142839f },
{ .x = -29.3917408f, .y = -7.87549400f },
{ .x = -30.4285698f, .y = 0.00000000f },
{ .x = -29.3917408f, .y = 7.87550640f },
{ .x = -26.3519135f, .y = 15.2142954f },
{ .x = -21.5162582f, .y = 21.5162449f },
{ .x = -15.2142839f, .y = 26.3519192f },
{ .x = -7.87549400f, .y = 29.3917446f },
{ .x = 0.00000000f, .y = 35.5000000f },
{ .x = 7.38086557f, .y = 34.7242393f },
{ .x = 14.4391518f, .y = 32.4308624f },
{ .x = 20.8663788f, .y = 28.7201023f },
{ .x = 26.3816433f, .y = 23.7541351f },
{ .x = 30.7439041f, .y = 17.7499981f },
{ .x = 33.7625046f, .y = 10.9701014f },
{ .x = 35.3055267f, .y = 3.71075916f },
{ .x = 35.3055267f, .y = -3.71076059f },
{ .x = 33.7625084f, .y = -10.9701033f },
{ .x = 30.7439022f, .y = -17.7500000f },
{ .x = 26.3816414f, .y = -23.7541370f },
{ .x = 20.8663769f, .y = -28.7201042f },
{ .x = 14.4391499f, .y = -32.4308662f },
{ .x = 7.38086414f, .y = -34.7242393f },
{ .x = 0.00000000f, .y = -35.5000000f },
{ .x = -7.38086557f, .y = -34.7242393f },
{ .x = -14.4391594f, .y = -32.4308624f },
{ .x = -20.8663750f, .y = -28.7201023f },
{ .x = -26.3816452f, .y = -23.7541351f },
{ .x = -30.7438984f, .y = -17.7499981f },
{ .x = -33.7625084f, .y = -10.9700928f },
{ .x = -33.8194656f, .y = -3.71075916f },
{ .x = -33.8194656f, .y = 3.71075630f },
{ .x = -33.7625084f, .y = 10.9701080f },
{ .x = -30.7438984f, .y = 17.7500114f },
{ .x = -26.3816433f, .y = 23.7541409f },
{ .x = -20.8663864f, .y = 28.7201023f },
{ .x = -14.4391537f, .y = 32.4308662f },
{ .x = -7.38085985f, .y = 34.7242432f },
};
// Number of vertices stored in s_Points.
static const uint32_t kNumPoints = sizeof(s_Points) / sizeof(s_Points[0]);
// Per-vertex colors as packed 32-bit pixels. main() looks them up with the same
// vertex ids it uses for the positions, so entry i belongs to vertex i.
// The table repeats the same five values over and over.
static const uint32_t s_Colors[] = {
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
0xffff00fe,
0xffff000e,
0xff00ff2b,
0xff0000ff,
0xffff6c00,
};
// Triangle list of the test mesh: every three consecutive entries are the vertex ids
// of one triangle. main() uses each id to index both the vertex positions and s_Colors.
static const uint16_t s_Indices[] = {
107, 106, 78 ,
107, 78 , 79 ,
107, 79 , 108,
108, 79 , 80 ,
108, 80 , 109,
106, 105, 77 ,
106, 77 , 78 ,
109, 80 , 81 ,
109, 81 , 110,
105, 104, 77 ,
110, 81 , 111,
79 , 78 , 56 ,
79 , 56 , 80 ,
78 , 77 , 55 ,
78 , 55 , 56 ,
80 , 56 , 57 ,
80 , 57 , 81 ,
104, 103, 76 ,
104, 76 , 77 ,
111, 81 , 82 ,
111, 82 , 112,
77 , 76 , 54 ,
77 , 54 , 55 ,
81 , 57 , 58 ,
81 , 58 , 82 ,
56 , 55 , 36 ,
56 , 36 , 37 ,
56 , 37 , 57 ,
55 , 54 , 35 ,
55 , 35 , 36 ,
57 , 37 , 38 ,
57 , 38 , 58 ,
82 , 58 , 59 ,
82 , 59 , 83 ,
82 , 83 , 112,
76 , 103, 75 ,
76 , 75 , 53 ,
76 , 53 , 54 ,
112, 83 , 113,
103, 102, 75 ,
54 , 53 , 35 ,
58 , 38 , 39 ,
58 , 39 , 59 ,
37 , 36 , 22 ,
37 , 22 , 23 ,
37 , 23 , 38 ,
36 , 35 , 21 ,
36 , 21 , 22 ,
35 , 53 , 34 ,
35 , 34 , 21 ,
38 , 23 , 39 ,
75 , 102, 74 ,
75 , 74 , 52 ,
75 , 52 , 53 ,
22 , 21 , 10 ,
22 , 10 , 11 ,
22 , 11 , 23 ,
83 , 59 , 60 ,
83 , 60 , 84 ,
83 , 84 , 113,
59 , 39 , 60 ,
53 , 52 , 34 ,
102, 101, 74 ,
113, 84 , 114,
21 , 34 , 20 ,
21 , 20 , 10 ,
23 , 11 , 12 ,
23 , 12 , 24 ,
23 , 24 , 39 ,
39 , 24 , 40 ,
39 , 40 , 60 ,
34 , 52 , 33 ,
34 , 33 , 20 ,
11 , 10 , 4 ,
11 , 4 , 1 ,
11 , 1 , 12 ,
10 , 20 , 9 ,
10 , 9 , 3 ,
10 , 3 , 4 ,
74 , 101, 73 ,
74 , 73 , 51 ,
74 , 51 , 52 ,
84 , 60 , 41 ,
84 , 41 , 61 ,
84 , 61 , 114,
60 , 40 , 41 ,
52 , 51 , 33 ,
20 , 33 , 19 ,
20 , 19 , 9 ,
24 , 12 , 13 ,
24 , 13 , 40 ,
101, 100, 73 ,
114, 61 , 85 ,
4 , 3 , 0 ,
4 , 0 , 1 ,
40 , 13 , 25 ,
40 , 25 , 41 ,
33 , 51 , 32 ,
33 , 32 , 19 ,
9 , 19 , 8 ,
9 , 8 , 3 ,
12 , 1 , 5 ,
12 , 5 , 13 ,
100, 99 , 73 ,
73 , 99 , 72 ,
73 , 72 , 51 ,
51 , 72 , 50 ,
51 , 50 , 32 ,
19 , 32 , 18 ,
19 , 18 , 8 ,
3 , 8 , 2 ,
3 , 2 , 0 ,
0 , 2 , 1 ,
1 , 2 , 5 ,
13 , 5 , 14 ,
13 , 14 , 25 ,
41 , 25 , 42 ,
41 , 42 , 62 ,
41 , 62 , 61 ,
61 , 62 , 86 ,
61 , 86 , 85 ,
5 , 2 , 6 ,
5 , 6 , 14 ,
8 , 18 , 17 ,
8 , 17 , 7 ,
8 , 7 , 2 ,
32 , 50 , 31 ,
32 , 31 , 18 ,
25 , 14 , 26 ,
25 , 26 , 42 ,
2 , 7 , 6 ,
99 , 98 , 72 ,
86 , 62 , 87 ,
18 , 31 , 17 ,
14 , 6 , 15 ,
14 , 15 , 27 ,
14 , 27 , 26 ,
42 , 26 , 43 ,
42 , 43 , 63 ,
42 , 63 , 62 ,
50 , 72 , 71 ,
50 , 71 , 49 ,
50 , 49 , 31 ,
72 , 98 , 71 ,
62 , 63 , 87 ,
7 , 17 , 16 ,
7 , 16 , 6 ,
6 , 16 , 15 ,
31 , 49 , 48 ,
31 , 48 , 30 ,
31 , 30 , 17 ,
26 , 27 , 43 ,
17 , 30 , 29 ,
17 , 29 , 16 ,
15 , 16 , 28 ,
15 , 28 , 27 ,
98 , 97 , 71 ,
87 , 63 , 88 ,
49 , 71 , 70 ,
49 , 70 , 48 ,
43 , 27 , 44 ,
43 , 44 , 64 ,
43 , 64 , 63 ,
71 , 97 , 70 ,
16 , 29 , 28 ,
63 , 64 , 88 ,
27 , 28 , 45 ,
27 , 45 , 44 ,
30 , 48 , 47 ,
30 , 47 , 29 ,
29 , 47 , 46 ,
29 , 46 , 28 ,
28 , 46 , 45 ,
48 , 70 , 69 ,
48 , 69 , 47 ,
44 , 45 , 65 ,
44 , 65 , 64 ,
97 , 96 , 70 ,
88 , 64 , 89 ,
64 , 65 , 89 ,
70 , 96 , 69 ,
45 , 46 , 66 ,
45 , 66 , 65 ,
47 , 69 , 68 ,
47 , 68 , 46 ,
46 , 68 , 67 ,
46 , 67 , 66 ,
69 , 96 , 95 ,
69 , 95 , 94 ,
69 , 94 , 68 ,
65 , 66 , 91 ,
65 , 91 , 90 ,
65 , 90 , 89 ,
68 , 94 , 93 ,
68 , 93 , 67 ,
66 , 67 , 92 ,
66 , 92 , 91 ,
67 , 93 ,92 ,
};
// Number of entries in s_Indices (three per triangle).
static const uint32_t kNumIndices = sizeof(s_Indices) / sizeof(s_Indices[0]);
// Software rasterizer state: a heap-allocated 32-bit framebuffer and its dimensions.
// Created by swrCreateContext() and released by swrDestroyContext().
typedef struct swr_context
{
uint32_t* m_FrameBuffer; // m_Width * m_Height packed pixels; pixel (x, y) is at index x + y * m_Width
uint32_t m_Width; // framebuffer width in pixels
uint32_t m_Height; // framebuffer height in pixels
} swr_context;
// Description of a fixed-size bitmap font used by swrDrawText().
// NOTE(review): the field meanings below are inferred from the names and from the
// s_Font8x8 initializer; confirm against swrDrawText().
typedef struct swr_font
{
uint8_t* m_CharData; // glyph bitmap data
uint32_t m_CharWidth; // glyph width (8 for s_Font8x8)
uint32_t m_CharHeight; // glyph height (8 for s_Font8x8)
uint8_t m_CharMin; // presumably the first character code present in m_CharData
uint8_t m_CharMax; // presumably the last character code present in m_CharData
uint8_t m_MissingCharFallbackID; // presumably the glyph used for characters outside [m_CharMin, m_CharMax]
} swr_font;
// The built-in 8x8 font, backed by the font8x8_basic table (character codes 0..0x7f).
// Defined as a file-scope compound literal, which has static storage duration in C.
static const swr_font* s_Font8x8 = &(swr_font){
.m_CharData = font8x8_basic,
.m_CharWidth = 8,
.m_CharHeight = 8,
.m_CharMin = 0,
.m_CharMax = 0x7f,
.m_MissingCharFallbackID = 0
};
// Forward declarations of the software rasterizer functions so that main() can call
// them before their definitions.
// Context lifetime and whole-framebuffer operations.
static swr_context* swrCreateContext(uint32_t w, uint32_t h);
static void swrDestroyContext(swr_context* ctx);
static void swrClear(swr_context* ctx, uint32_t color);
// Primitive drawing. Coordinates are in pixels; colors are packed 32-bit pixels.
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color);
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color);
// Triangle rasterizers. All variants share one signature: three vertices and one
// color per vertex. The numbered and SIMD-suffixed functions are alternative
// implementations of the same operation.
static void swrDrawTriangle(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_1(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_5(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_6(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_7(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_8(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_9(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_SSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
static void swrDrawTriangle_SSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2);
// Text drawing with a bitmap font. main() passes NULL for `end`.
// NOTE(review): presumably NULL means "draw up to the terminating NUL" - confirm in the definition.
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x, int32_t y, const char* str, const char* end, uint32_t color);
// Sample storage for frame-time statistics (average / min / max); main() zero-fills
// it before use and feeds it through movAvgPush().
// NOTE(review): presumably a ring buffer over the last 128 samples - confirm in movAvgPush().
typedef struct moving_averaged
{
double m_Value[128]; // stored samples
uint32_t m_NumItems; // presumably the number of valid entries in m_Value
uint32_t m_NextItemID; // presumably the slot the next pushed sample is written to
} moving_averaged;
// Forward declarations of the moving-average helpers used by main().
static void movAvgPush(moving_averaged* avg, double val);
static double movAvgGetAverage(const moving_averaged* avg);
static void movAvgGetMinMax(const moving_averaged* avg, double* tmin, double* tmax);
int32_t main(void)
{
swr_context* ctx = swrCreateContext(FRAMEBUFFER_CONFIG_WIDTH, FRAMEBUFFER_CONFIG_HEIGHT);
if (!ctx) {
return -1;
}
struct mfb_window* window = mfb_open_ex("swr", ctx->m_Width, ctx->m_Height, 0u);
if (!window) {
swrDestroyContext(ctx);
return -2;
}
struct mfb_timer* timer = mfb_timer_create();
// Calculate bounding rect of original mesh
vec2f bboxMin = s_Points[0];
vec2f bboxMax = s_Points[0];
for (uint32_t i = 1; i < kNumPoints; ++i) {
const vec2f* pt = &s_Points[i];
bboxMin.x = pt->x < bboxMin.x ? pt->x : bboxMin.x;
bboxMin.y = pt->y < bboxMin.y ? pt->y : bboxMin.y;
bboxMax.x = pt->x > bboxMax.x ? pt->x : bboxMax.x;
bboxMax.y = pt->y > bboxMax.y ? pt->y : bboxMax.y;
}
// Transform all points to image space...
vec2f* transformedPoints = (vec2f*)malloc(sizeof(vec2f) * kNumPoints);
for (uint32_t i = 0; i < kNumPoints; ++i) {
const vec2f* pt = &s_Points[i];
vec2f transPt;
transPt.y = (1.0f - ((pt->x - bboxMin.x) / (bboxMax.x - bboxMin.x))) * (ctx->m_Width - 64) + 32;
transPt.x = ((pt->y - bboxMin.y) / (bboxMax.y - bboxMin.y)) * (ctx->m_Height - 64) + 32;
transformedPoints[i] = transPt;
}
moving_averaged frameTimeAvg;
memset(&frameTimeAvg, 0, sizeof(moving_averaged));
do {
mfb_timer_reset(timer);
{
swrClear(ctx, SWR_COLOR_BLACK);
#if 1
const uint32_t numTris = kNumIndices / 3;
for (uint32_t i = 0; i < numTris; ++i) {
const uint16_t id0 = s_Indices[i * 3 + 0];
const uint16_t id1 = s_Indices[i * 3 + 1];
const uint16_t id2 = s_Indices[i * 3 + 2];
const vec2f* pt0 = &transformedPoints[id0];
const vec2f* pt1 = &transformedPoints[id1];
const vec2f* pt2 = &transformedPoints[id2];
swrDrawTriangle_SSE41(ctx
, (int32_t)pt0->x, (int32_t)pt0->y
, (int32_t)pt1->x, (int32_t)pt1->y
, (int32_t)pt2->x, (int32_t)pt2->y
, s_Colors[id0]
, s_Colors[id1]
, s_Colors[id2]
);
}
#else
swrDrawTriangle(ctx, -10, 80, 200, -10, 300, 200, SWR_COLOR_RED, SWR_COLOR_GREEN, SWR_COLOR_BLUE);
#endif
}
const double dt = mfb_timer_delta(timer);
movAvgPush(&frameTimeAvg, dt * 1000.0);
{
const double tAvg = movAvgGetAverage(&frameTimeAvg);
double tMin, tMax;
movAvgGetMinMax(&frameTimeAvg, &tMin, &tMax);
char str[256];
sprintf(str, "Frame Time: %.2fms (avg: %.2fms, min: %.2fms, max: %.2fms)", dt * 1000.0, tAvg, tMin, tMax);
swrDrawText(ctx, s_Font8x8, 8, 8, str, NULL, SWR_COLOR_WHITE);
}
int32_t state = mfb_update_ex(window, ctx->m_FrameBuffer, ctx->m_Width, ctx->m_Height);
if (state < 0) {
window = NULL;
break;
}
} while (mfb_wait_sync(window));
mfb_close(window);
swrDestroyContext(ctx);
return 0;
}
//////////////////////////////////////////////////////////////////////////
// SoftRast
//
// Allocates a rendering context with a zero-filled framebuffer of w x h 32-bit pixels.
//
// w, h: framebuffer dimensions in pixels.
// Returns the new context, or NULL when the requested size cannot be represented
// in size_t or when an allocation fails. Release the context with swrDestroyContext().
static swr_context* swrCreateContext(uint32_t w, uint32_t h)
{
    // Reject sizes whose byte count (w * h * sizeof(uint32_t)) would wrap around
    // size_t; a wrapped count would produce a buffer smaller than the dimensions
    // stored in the context.
    if (h != 0 && (size_t)w > (SIZE_MAX / sizeof(uint32_t)) / (size_t)h) {
        return NULL;
    }

    // calloc() hands back zero-initialized memory, so no separate memset() is needed.
    swr_context* ctx = (swr_context*)calloc(1, sizeof(swr_context));
    if (!ctx) {
        return NULL;
    }

    ctx->m_FrameBuffer = (uint32_t*)calloc((size_t)w * (size_t)h, sizeof(uint32_t));
    if (!ctx->m_FrameBuffer) {
        // Nothing but the context itself has been allocated at this point.
        free(ctx);
        return NULL;
    }

    ctx->m_Width = w;
    ctx->m_Height = h;
    return ctx;
}
// Releases a context created by swrCreateContext(), including its framebuffer.
// Passing NULL is allowed and does nothing, mirroring free(NULL).
static void swrDestroyContext(swr_context* ctx)
{
    if (!ctx) {
        return;
    }
    free(ctx->m_FrameBuffer);
    free(ctx);
}
// Fills the whole framebuffer with one packed 32-bit color.
//
// ctx:   rendering context; must not be NULL.
// color: pixel value written to every framebuffer entry.
static void swrClear(swr_context* ctx, uint32_t color)
{
    uint32_t* pixels = ctx->m_FrameBuffer;
    // Compute the pixel count in size_t: a 32-bit product would wrap for large
    // framebuffers and leave part of the buffer uncleared.
    const size_t numPixels = (size_t)ctx->m_Width * (size_t)ctx->m_Height;
    for (size_t i = 0; i < numPixels; ++i) {
        pixels[i] = color;
    }
}
// Writes one pixel into the framebuffer.
// Coordinates outside [0, m_Width) x [0, m_Height) are silently ignored, so callers
// do not have to clip.
static void swrDrawPixel(swr_context* ctx, int32_t x, int32_t y, uint32_t color)
{
    const int32_t fbWidth = (int32_t)ctx->m_Width;
    const int32_t fbHeight = (int32_t)ctx->m_Height;
    if (x >= 0 && x < fbWidth && y >= 0 && y < fbHeight) {
        // Row-major addressing: row y starts at y * m_Width.
        const uint32_t pixelIndex = (uint32_t)x + (uint32_t)y * ctx->m_Width;
        ctx->m_FrameBuffer[pixelIndex] = color;
    }
}
// Draws a one pixel wide line from (x0, y0) to (x1, y1), both end points included,
// using an integer error accumulator. Clipping is left to swrDrawPixel().
static void swrDrawLine(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, uint32_t color)
{
    // Step along the axis with the larger extent ("major") so that every step
    // produces exactly one pixel; the other axis ("minor") advances occasionally.
    const bool transposed = abs(x0 - x1) < abs(y0 - y1);
    int32_t majorStart = transposed ? y0 : x0;
    int32_t minorStart = transposed ? x0 : y0;
    int32_t majorEnd = transposed ? y1 : x1;
    int32_t minorEnd = transposed ? x1 : y1;

    // Always walk towards increasing major coordinate.
    if (majorStart > majorEnd) {
        const int32_t savedMajor = majorStart;
        const int32_t savedMinor = minorStart;
        majorStart = majorEnd;
        minorStart = minorEnd;
        majorEnd = savedMajor;
        minorEnd = savedMinor;
    }

    const int32_t majorSpan = majorEnd - majorStart;
    const int32_t minorStep = (minorEnd > minorStart) ? 1 : -1;
    // The error terms are scaled by two so that the half-pixel threshold stays integral.
    const int32_t errorStep = abs(minorEnd - minorStart) * 2;

    int32_t errorAcc = 0;
    int32_t minor = minorStart;
    for (int32_t major = majorStart; major <= majorEnd; major++) {
        if (transposed) {
            // Major axis is y: undo the coordinate swap when plotting.
            swrDrawPixel(ctx, minor, major, color);
        } else {
            swrDrawPixel(ctx, major, minor, color);
        }
        errorAcc += errorStep;
        if (errorAcc > majorSpan) {
            minor += minorStep;
            errorAcc -= majorSpan * 2;
        }
    }
}
// Returns the smaller of two signed 32-bit integers.
static inline int32_t swr_mini(int32_t a, int32_t b)
{
    if (a < b) {
        return a;
    }
    return b;
}
// Returns the larger of two signed 32-bit integers.
static inline int32_t swr_maxi(int32_t a, int32_t b)
{
    if (a > b) {
        return a;
    }
    return b;
}
// Returns the smallest of three signed 32-bit integers.
static inline int32_t swr_min3i(int32_t a, int32_t b, int32_t c)
{
    const int32_t smallerOfBC = swr_mini(b, c);
    return swr_mini(a, smallerOfBC);
}
// Returns the largest of three signed 32-bit integers.
static inline int32_t swr_max3i(int32_t a, int32_t b, int32_t c)
{
    const int32_t largerOfBC = swr_maxi(b, c);
    return swr_maxi(a, largerOfBC);
}
// Computes the barycentric coordinates of point (x, y) with respect to the triangle
// (x0, y0), (x1, y1), (x2, y2).
//
// bc: receives three weights; bc[i] is the weight of vertex i.
// Returns true when all three weights are non-negative, i.e. the point lies inside
// the triangle or on its boundary. Returns false without touching bc when the
// triangle is degenerate (magnitude of the edge cross product below 1); otherwise bc
// is written even when the result is false.
static bool swr_calcBarycentricCoords(int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, int32_t x, int32_t y, float* bc)
{
    // Edge vectors starting at vertex 0.
    const float edge1X = (float)(x1 - x0);
    const float edge1Y = (float)(y1 - y0);
    const float edge2X = (float)(x2 - x0);
    const float edge2Y = (float)(y2 - y0);

    // Cross product of the two edges; its magnitude is twice the triangle area.
    const float cross = edge2X * edge1Y - edge1X * edge2Y;
    if (fabsf(cross) < 1.0f) {
        return false;
    }

    // Vector from the sample point to vertex 0.
    const float toV0X = (float)(x0 - x);
    const float toV0Y = (float)(y0 - y);

    // Unnormalized weights of vertex 2 and vertex 1.
    const float rawW2 = edge1X * toV0Y - toV0X * edge1Y;
    const float rawW1 = toV0X * edge2Y - edge2X * toV0Y;

    bc[0] = 1.0f - ((rawW2 + rawW1) / cross);
    bc[1] = rawW1 / cross;
    bc[2] = rawW2 / cross;

    for (int32_t i = 0; i < 3; ++i) {
        if (!(bc[i] >= 0.0f)) {
            return false;
        }
    }
    return true;
}
// avg: 8.82ms, min: 8.44ms, max: 9.28ms
// Scalar triangle rasterizer: visits every pixel of the triangle's bounding box
// (clamped to the framebuffer), computes the pixel's barycentric coordinates and,
// when the pixel is covered, writes the per-vertex colors blended by those weights.
//
// (x0, y0), (x1, y1), (x2, y2): triangle vertices in pixels.
// color0, color1, color2: packed 32-bit colors of the matching vertices.
static void swrDrawTriangle_1(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Bounding box of the triangle, clamped to the framebuffer. A triangle that is
    // completely off screen yields an empty range and no loop iterations.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);

    // Split every vertex color into its 8-bit channels once, outside the pixel loop.
    const uint32_t c0r = (color0 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c0g = (color0 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c0b = (color0 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c0a = (color0 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c1r = (color1 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c1g = (color1 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c1b = (color1 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c1a = (color1 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;
    const uint32_t c2r = (color2 & SWR_COLOR_RED_Msk) >> SWR_COLOR_RED_Pos;
    const uint32_t c2g = (color2 & SWR_COLOR_GREEN_Msk) >> SWR_COLOR_GREEN_Pos;
    const uint32_t c2b = (color2 & SWR_COLOR_BLUE_Msk) >> SWR_COLOR_BLUE_Pos;
    const uint32_t c2a = (color2 & SWR_COLOR_ALPHA_Msk) >> SWR_COLOR_ALPHA_Pos;

    for (int32_t y = bboxMinY; y <= bboxMaxY; ++y) {
        for (int32_t x = bboxMinX; x <= bboxMaxX; ++x) {
            float barycentricCoords[3];
            if (!swr_calcBarycentricCoords(x0, y0, x1, y1, x2, y2, x, y, &barycentricCoords[0])) {
                // Pixel is outside the triangle, or the triangle is degenerate.
                continue;
            }

            // Weighted sum per channel; the float result is truncated when
            // converted back to an integer.
            const uint32_t cr = (uint32_t)(c0r * barycentricCoords[0] + c1r * barycentricCoords[1] + c2r * barycentricCoords[2]);
            const uint32_t cg = (uint32_t)(c0g * barycentricCoords[0] + c1g * barycentricCoords[1] + c2g * barycentricCoords[2]);
            const uint32_t cb = (uint32_t)(c0b * barycentricCoords[0] + c1b * barycentricCoords[1] + c2b * barycentricCoords[2]);
            const uint32_t ca = (uint32_t)(c0a * barycentricCoords[0] + c1a * barycentricCoords[1] + c2a * barycentricCoords[2]);
            swrDrawPixel(ctx, x, y, SWR_COLOR(cr, cg, cb, ca));
        }
    }
}
// avg: 4.95ms, min: 4.72ms, max: 5.24ms
// Same bounding-box traversal as the scalar rasterizer, but the color blend is done
// with SSE: each vertex color is expanded to four floats (one per byte lane), the
// three colors are scaled by the barycentric weights and summed, and the result is
// packed back to four bytes with saturation and stored straight into the framebuffer.
static void swrDrawTriangle_2(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Triangle bounding box clamped to the framebuffer, so the direct stores below
    // never leave the buffer.
    const int32_t minX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t minY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t maxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t maxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);

    const __m128i zero = _mm_setzero_si128();

    // Widen each packed color: bytes -> 16-bit -> 32-bit integers -> floats.
    const __m128i color0_u16 = _mm_unpacklo_epi8(_mm_loadu_si32(&color0), zero);
    const __m128i color1_u16 = _mm_unpacklo_epi8(_mm_loadu_si32(&color1), zero);
    const __m128i color2_u16 = _mm_unpacklo_epi8(_mm_loadu_si32(&color2), zero);
    const __m128 color0_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(color0_u16, zero));
    const __m128 color1_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(color1_u16, zero));
    const __m128 color2_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(color2_u16, zero));

    for (int32_t y = minY; y <= maxY; ++y) {
        const uint32_t rowStart = (uint32_t)y * ctx->m_Width;
        for (int32_t x = minX; x <= maxX; ++x) {
            float weights[3];
            if (swr_calcBarycentricCoords(x0, y0, x1, y1, x2, y2, x, y, &weights[0])) {
                // Scale every vertex color by its weight (broadcast to all lanes).
                const __m128 part0 = _mm_mul_ps(color0_f, _mm_load1_ps(&weights[0]));
                const __m128 part1 = _mm_mul_ps(color1_f, _mm_load1_ps(&weights[1]));
                const __m128 part2 = _mm_mul_ps(color2_f, _mm_load1_ps(&weights[2]));
                const __m128 blended = _mm_add_ps(_mm_add_ps(part0, part1), part2);

                // Narrow back: float -> 32-bit int -> 16-bit -> 8-bit, saturating.
                const __m128i blended_i32 = _mm_cvtps_epi32(blended);
                const __m128i blended_i16 = _mm_packs_epi32(blended_i32, zero);
                const __m128i pixel = _mm_packus_epi16(blended_i16, zero);
                _mm_storeu_si32(&ctx->m_FrameBuffer[x + rowStart], pixel);
            }
        }
    }
}
// Gouraud-shaded triangle, variant 3: the barycentric computation is inlined.
// Everything that depends only on the triangle is hoisted out of the loops and
// everything that depends only on the row is hoisted out of the pixel loop.
//
// avg: 2.84ms, min: 2.67ms, max: 3.04ms
static void swrDrawTriangle_3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    // Edge vectors from vertex 0 to vertices 1 and 2.
    const int32_t edge10x = x1 - x0;
    const int32_t edge10y = y1 - y0;
    const int32_t edge20x = x2 - x0;
    const int32_t edge20y = y2 - y0;

    // Cross product of the two edges; zero means the vertices are collinear.
    const int32_t cross = edge20x * edge10y - edge10x * edge20y;
    if (abs(cross) < 1) {
        return;
    }

    // Bounding box of the three vertices, clamped to the framebuffer.
    const int32_t firstCol = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t firstRow = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t lastCol = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t lastRow = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);

    // Widen the four packed 8-bit channels of each vertex colour to floats.
    const __m128i zero = _mm_setzero_si128();
    const __m128 vertexColor0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), zero), zero));
    const __m128 vertexColor1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), zero), zero));
    const __m128 vertexColor2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), zero), zero));

    const float invCross = 1.0f / (float)cross;

    for (int32_t row = firstRow; row <= lastRow; ++row) {
        // Row-constant terms of the two cross products.
        const int32_t toV0y = y0 - row;
        const int32_t edge10x_toV0y = edge10x * toV0y;
        const int32_t edge20x_toV0y = edge20x * toV0y;
        uint32_t* rowPixels = &ctx->m_FrameBuffer[row * ctx->m_Width];

        for (int32_t col = firstCol; col <= lastCol; ++col) {
            const int32_t toV0x = x0 - col;
            const int32_t rawWeight2 = edge10x_toV0y - toV0x * edge10y;
            const int32_t rawWeight1 = toV0x * edge20y - edge20x_toV0y;

            const float weight2 = (float)rawWeight2 * invCross;
            const float weight1 = (float)rawWeight1 * invCross;
            const float weight0 = 1.0f - (weight2 + weight1);

            // A negative weight means the pixel is outside the triangle.
            if (weight0 < 0.0f || weight1 < 0.0f || weight2 < 0.0f) {
                continue;
            }

            // Weighted sum of the three vertex colours, summed as (c0 + c1) + c2.
            const __m128 part0 = _mm_mul_ps(vertexColor0, _mm_set1_ps(weight0));
            const __m128 part1 = _mm_mul_ps(vertexColor1, _mm_set1_ps(weight1));
            const __m128 part2 = _mm_mul_ps(vertexColor2, _mm_set1_ps(weight2));
            const __m128 blended = _mm_add_ps(_mm_add_ps(part0, part1), part2);

            // f32 -> s32 -> s16 -> u8 (saturating), leaving one packed pixel in lane 0.
            const __m128i pixel = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(blended), zero), zero);
            _mm_storeu_si32(&rowPixels[col], pixel);
        }
    }
}
// Gouraud-shaded triangle, variant 4: instead of testing every pixel of the
// bounding box, the covered span [ixmin, ixmax] of each row is solved from the
// three edge functions, and the unnormalized barycentric coordinates are
// stepped incrementally with integer adds.
//
// Notation (all relative to the bounding-box origin):
//   ivx / ivy / ivz  - edge-function values at the first pixel of the row
//   iux / iuy / iuz  - edge-function values at the current pixel
// A pixel is inside when all three values have the sign of iarea (or are 0).
// Moving one pixel right changes them by (-dy01, -dy20, +dy01_dy20); moving
// one row down changes them by (+dx01, +dx20, -(dx01 + dx20)).
//
// avg: 1.66ms, min: 1.54ms, max: 1.99ms
static void swrDrawTriangle_4(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    const int32_t dx20 = x2 - x0;
    const int32_t dx10 = x1 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy10 = y1 - y0;
    const int32_t iarea = dx20 * dy10 - dx10 * dy20;
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    }

    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Entirely off-screen: nothing to draw, and do not form a row pointer
        // outside the framebuffer.
        return;
    }
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Widen the four packed 8-bit channels of each vertex colour to floats.
    const __m128i xmm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), xmm_zero), xmm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), xmm_zero), xmm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), xmm_zero), xmm_zero));

    const float inv_area = 1.0f / (float)iarea;
    const int32_t dy01 = -dy10;
    const int32_t dx01 = -dx10;
    const int32_t dy01_dy20 = dy01 + dy20;

    // For each edge: does its constraint bound the span on the right (the
    // value moves away from the sign of iarea as x grows) or on the left?
    // A zero delta means the edge is horizontal and does not bound the span.
    const bool iarea_dy01_samesign = (iarea > 0 && dy01 > 0) || (iarea < 0 && dy01 < 0);
    const bool iarea_dy20_samesign = (iarea > 0 && dy20 > 0) || (iarea < 0 && dy20 < 0);
    const bool w_bounds_right = (iarea > 0 && dy01_dy20 < 0) || (iarea < 0 && dy01_dy20 > 0);
    const bool w_bounds_left = (iarea > 0 && dy01_dy20 > 0) || (iarea < 0 && dy01_dy20 < 0);

    // Edge-function values at the bounding-box origin.
    int32_t ivx = ((x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01);
    int32_t ivy = ((x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20);
    int32_t ivz = iarea - ivx - ivy;

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax.
        // The dy != 0 tests keep horizontal edges out of the divisions:
        // converting the resulting +/-inf to int32_t would be undefined.
        if (iarea_dy01_samesign) {
            ixmax = swr_mini(ixmax, (int32_t)floorf((float)ivx / (float)dy01));
        } else if (dy01 != 0) {
            ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)ivx / (float)dy01));
        }
        if (iarea_dy20_samesign) {
            ixmax = swr_mini(ixmax, (int32_t)floorf((float)ivy / (float)dy20));
        } else if (dy20 != 0) {
            ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)ivy / (float)dy20));
        }
        // The third edge is applied unconditionally, like the other two. When
        // the whole (screen-clamped) row lies outside it, ixmax goes negative
        // and the row is skipped instead of being filled with outside pixels.
        if (w_bounds_right) {
            ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)ivz / (float)dy01_dy20));
        } else if (w_bounds_left) {
            ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)ivz / (float)dy01_dy20));
        }

        // Edge-function values at the first covered pixel of the row.
        int32_t iux = ivx - ixmin * dy01;
        int32_t iuy = ivy - ixmin * dy20;
        int32_t iuz = ivz + ixmin * dy01_dy20;
        for (int32_t ix = ixmin; ix <= ixmax; ++ix) {
            const float bcx = (float)iux * inv_area;
            const float bcy = (float)iuy * inv_area;
            const float bcz = (float)iuz * inv_area;
            assert(bcx >= 0.0f && bcy >= 0.0f && bcz >= 0.0f);

            // color = c0 * bcz + c1 * bcy + c2 * bcx
            const __m128 xmm_c0_scaled = _mm_mul_ps(xmm_c0, _mm_load1_ps(&bcz));
            const __m128 xmm_c1_scaled = _mm_mul_ps(xmm_c1, _mm_load1_ps(&bcy));
            const __m128 xmm_c2_scaled = _mm_mul_ps(xmm_c2, _mm_load1_ps(&bcx));
            const __m128 xmm_c = _mm_add_ps(_mm_add_ps(xmm_c0_scaled, xmm_c1_scaled), xmm_c2_scaled);

            // f32 -> s32 -> s16 -> u8 (saturating), one packed pixel in lane 0.
            const __m128i imm_c = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c), xmm_zero), xmm_zero);
            _mm_storeu_si32(&framebufferRow[ix], imm_c);

            iux -= dy01;
            iuy -= dy20;
            iuz += dy01_dy20;
        }

        // Step the row-start values down one row.
        ivx += dx01;
        ivy += dx20;
        ivz -= dx01 + dx20;
        framebufferRow += ctx->m_Width;
    }
}
// Same as swrDrawTriangle_4() but with X and Y counters in XMM regs.
//
// The vertices are reordered so that iarea is positive, which removes the
// sign cases from the span computation: a pixel is inside when all three
// unnormalized barycentric values (lanes 0..2 of imm_iu) are >= 0.
// Lane 0 weights color2, lane 1 weights color1 and lane 2 weights color0.
//
// avg: 1.38ms, min: 1.29ms, max: 1.67ms
static void swrDrawTriangle_5(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    const int32_t dx20 = x2 - x0;
    const int32_t dx10 = x1 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy10 = y1 - y0;

    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Entirely off-screen: nothing to draw, and do not form a row pointer
        // outside the framebuffer.
        return;
    }
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Widen the four packed 8-bit channels of each vertex colour to floats.
    const __m128i xmm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), xmm_zero), xmm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), xmm_zero), xmm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), xmm_zero), xmm_zero));

    const int32_t dy01 = -dy10;
    const int32_t dx01 = -dx10;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);
    // Reciprocals of the per-pixel deltas. A zero delta (horizontal edge)
    // never bounds a span, so its reciprocal is never used; store 0 instead
    // of dividing by zero.
    const float inv_dy01 = (dy01 != 0) ? 1.0f / (float)dy01 : 0.0f;
    const float inv_dy20 = (dy20 != 0) ? 1.0f / (float)dy20 : 0.0f;
    const float inv_dy01_dy20 = (dy01_dy20 != 0) ? 1.0f / (float)dy01_dy20 : 0.0f;

    const int32_t dx0min = x0 - bboxMinX;
    const int32_t dy0min = y0 - bboxMinY;

    // Per-row deltas of the three values (lane 3 is unused and stays 0).
    const __m128i imm_div = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Values at the bounding-box origin.
    const int32_t iv0 = dx0min * dy01 - dy0min * dx01;
    const int32_t iv1 = dx0min * dy20 - dy0min * dx20;
    const int32_t iv2 = iarea - iv0 - iv1;
    __m128i imm_iv = _mm_set_epi32(0, iv2, iv1, iv0);

    // Per-pixel deltas of the three values.
    const __m128i imm_diu = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax: each edge whose value decreases with x
        // bounds the span on the right, each edge whose value increases with
        // x bounds it on the left.
        {
            int32_t iv[4];
            _mm_storeu_si128((__m128i*)&iv[0], imm_iv);

            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)iv[0] * inv_dy01));
            } else if (dy01 < 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)iv[0] * inv_dy01));
            }

            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)iv[1] * inv_dy20));
            } else if (dy20 < 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)iv[1] * inv_dy20));
            }

            // Applied even when iv[2] is negative: the row start is then
            // already outside this edge, ixmax becomes negative and the row
            // is skipped. This happens when the screen clamps the bounding
            // box and the triangle's part of the row lies off-screen.
            if (dy01_dy20 < 0) {
                ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)iv[2] * inv_dy01_dy20));
            } else if (dy01_dy20 > 0) {
                ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)iv[2] * inv_dy01_dy20));
            }
        }

        // Values at the first covered pixel of the row.
        __m128i imm_iu = _mm_add_epi32(imm_iv, _mm_set_epi32(0, ixmin * dy01_dy20, -ixmin * dy20, -ixmin * dy01));
        for (int32_t ix = ixmin; ix <= ixmax; ++ix) {
            // No lane may have its sign bit set inside the span.
            assert(_mm_movemask_ps(_mm_castsi128_ps(imm_iu)) == 0);

            const __m128 xmm_bc = _mm_mul_ps(_mm_cvtepi32_ps(imm_iu), xmm_inv_area);
            const __m128 xmm_bc0 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(0, 0, 0, 0));
            const __m128 xmm_bc1 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(1, 1, 1, 1));
            const __m128 xmm_bc2 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(2, 2, 2, 2));

            // color = c0 * bc2 + c1 * bc1 + c2 * bc0
            const __m128 xmm_c0_scaled = _mm_mul_ps(xmm_c0, xmm_bc2);
            const __m128 xmm_c1_scaled = _mm_mul_ps(xmm_c1, xmm_bc1);
            const __m128 xmm_c2_scaled = _mm_mul_ps(xmm_c2, xmm_bc0);
            const __m128 xmm_c = _mm_add_ps(_mm_add_ps(xmm_c0_scaled, xmm_c1_scaled), xmm_c2_scaled);

            // f32 -> s32 -> s16 -> u8 (saturating), one packed pixel in lane 0.
            const __m128i imm_c = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c), xmm_zero), xmm_zero);
            _mm_storeu_si32(&framebufferRow[ix], imm_c);

            imm_iu = _mm_add_epi32(imm_iu, imm_diu);
        }

        imm_iv = _mm_add_epi32(imm_iv, imm_div);
        framebufferRow += ctx->m_Width;
    }
}
// Two-source 32-bit lane shuffle for integer vectors. SSE2 only offers the
// two-source form for floats (_mm_shuffle_ps), so both operands are
// reinterpreted as float vectors, shuffled, and reinterpreted back.
// Result lanes 0-1 are selected from `a`, lanes 2-3 from `b`, as encoded by
// the immediate `imm8` (see _MM_SHUFFLE).
#define _mm_shuffle_si128(a, b, imm8) \
    _mm_castps_si128(                 \
        _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), imm8))
// SSE2 stand-in for the SSE4.1 _mm_mullo_epi32(): multiplies the four 32-bit
// lanes of `a` and `b` and keeps the low 32 bits of each product.
//
// _mm_mul_epu32() only multiplies lanes 0 and 2 (producing two 64-bit
// results), so the odd lanes are handled by first shifting both inputs down
// by one lane. The low halves of the four products are then interleaved back
// into lane order.
static inline __m128i _mm_mullo_epi32_SSE2(const __m128i a, const __m128i b)
{
    // 64-bit products of lanes 0 and 2.
    const __m128i evenProducts = _mm_mul_epu32(a, b);

    // 64-bit products of lanes 1 and 3 (moved into positions 0 and 2).
    const __m128i aOdd = _mm_srli_si128(a, 4);
    const __m128i bOdd = _mm_srli_si128(b, 4);
    const __m128i oddProducts = _mm_mul_epu32(aOdd, bOdd);

    // Gather the low 32 bits of each product into lanes 0 and 1.
    const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0, 0, 2, 0));
    const __m128i oddLow = _mm_shuffle_epi32(oddProducts, _MM_SHUFFLE(0, 0, 2, 0));

    // Interleave: (even0, odd0, even1, odd1) == products of lanes 0, 1, 2, 3.
    return _mm_unpacklo_epi32(evenLow, oddLow);
}
// Same as swrDrawTriangle_5() but the inner loop has been unrolled to 4 pixels
// per iteration (plus handling of remainder pixels).
//
// Naming: u, v, w are the three barycentric coordinates; a trailing '_' marks
// the unused 4th lane. Unnormalized values are integers (imm_*), normalized
// values are floats (xmm_*). The pixel colour is
//   c0 + (c2 - c0) * u + (c1 - c0) * v
// so only u and v are carried through the inner loop.
//
// avg: 1.17ms, min: 1.10ms, max: 1.23ms
static void swrDrawTriangle_6(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Entirely off-screen: nothing to draw, and do not form a row pointer
        // outside the framebuffer.
        return;
    }
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Widen the four packed 8-bit channels of each vertex colour to floats.
    const __m128i imm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0);
    const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0);

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel: (0, 0, du, dv)
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel: (2du, 2dv, 3du, 3dv)
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the next set of pixels: (4du, 4dv, 4du, 4dv)
    const __m128 xmm_x_duv4_duv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 0, 1, 0));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Reciprocals of the per-pixel deltas. A zero delta (horizontal edge)
    // never bounds a span, so its reciprocal is never used; store 0 instead
    // of dividing by zero.
    const float inv_dy01_dy20 = (dy01_dy20 != 0) ? 1.0f / (float)dy01_dy20 : 0.0f;
    const float inv_dy20 = (dy20 != 0) ? 1.0f / (float)dy20 : 0.0f;
    const float inv_dy01 = (dy01 != 0) ? 1.0f / (float)dy01 : 0.0f;

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax: each coordinate that decreases with x
        // bounds the span on the right, each one that increases with x bounds
        // it on the left.
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[0] * inv_dy01));
            } else if (dy01 < 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[0] * inv_dy01));
            }

            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[1] * inv_dy20));
            } else if (dy20 < 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[1] * inv_dy20));
            }

            // Applied even when w is negative at the row start: the row start
            // is then already outside this edge, ixmax becomes negative and
            // the row is skipped. This happens when the screen clamps the
            // bounding box and the triangle's part of the row lies off-screen.
            if (dy01_dy20 < 0) {
                ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)row_uvw_[2] * inv_dy01_dy20));
            } else if (dy01_dy20 > 0) {
                ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)row_uvw_[2] * inv_dy01_dy20));
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            // (u, v, u, v) of the first pixel of the current group.
            __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate barycentric coordinates for the 4 pixels.
                const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
                const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

                // Extract barycentric coordinates for each pixel
                const __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2));
                const __m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3));
                const __m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2));
                const __m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3));

                // Calculate color of each pixel
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v)));
                const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v)));
                const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                const __m128i imm_c_p1 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p1), imm_zero), imm_zero);
                const __m128i imm_c_p2 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p2), imm_zero), imm_zero);
                const __m128i imm_c_p3 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p3), imm_zero), imm_zero);

                // Pack the 4 colors into a XMM registers and store into framebuffer
                const __m128i imm_c_p0011 = _mm_shuffle_si128(imm_c_p0, imm_c_p1, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p2233 = _mm_shuffle_si128(imm_c_p2, imm_c_p3, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p0123 = _mm_shuffle_si128(imm_c_p0011, imm_c_p2233, _MM_SHUFFLE(2, 0, 2, 0));
                _mm_storeu_si128((__m128i*)frameBuffer, imm_c_p0123);

                // Move on to the next set of pixels
                xmm_p0uvuv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv4_duv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3 pixels),
            // one at a time. Only lanes 0 and 1 (u, v) of xmm_p0uvuv are read
            // here, so stepping by the (du, dv, dw, 0) vector is sufficient.
            const uint32_t rem = numPixels & 3;
            for (uint32_t iRem = 0; iRem < rem; ++iRem) {
                const __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uvuv, xmm_p0uvuv, _MM_SHUFFLE(1, 1, 1, 1));
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0uvuv = _mm_add_ps(xmm_p0uvuv, xmm_x_duvw_1);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Same as swrDrawTriangle_6() but the inner loop increments the
// final 4 pixel barycentric coords (8 regs) directly.
//
// Naming: u, v, w are the three barycentric coordinates; a trailing '_' marks
// the unused 4th lane. Unnormalized values are integers (imm_*), normalized
// values are floats (xmm_*). The pixel colour is
//   c0 + (c2 - c0) * u + (c1 - c0) * v
// so only u and v are carried through the inner loop, each already broadcast
// to all four lanes.
//
// avg: 1.11ms, min: 1.06ms, max: 1.21ms
static void swrDrawTriangle_7(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Entirely off-screen: nothing to draw, and do not form a row pointer
        // outside the framebuffer.
        return;
    }
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Widen the four packed 8-bit channels of each vertex colour to floats.
    const __m128i imm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0);
    const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0);

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel: (0, 0, du, dv)
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel: (2du, 2dv, 3du, 3dv)
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
    // Broadcast single-pixel and four-pixel steps of u and v.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Reciprocals of the per-pixel deltas. A zero delta (horizontal edge)
    // never bounds a span, so its reciprocal is never used; store 0 instead
    // of dividing by zero.
    const float inv_dy01_dy20 = (dy01_dy20 != 0) ? 1.0f / (float)dy01_dy20 : 0.0f;
    const float inv_dy20 = (dy20 != 0) ? 1.0f / (float)dy20 : 0.0f;
    const float inv_dy01 = (dy01 != 0) ? 1.0f / (float)dy01 : 0.0f;

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax: each coordinate that decreases with x
        // bounds the span on the right, each one that increases with x bounds
        // it on the left.
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[0] * inv_dy01));
            } else if (dy01 < 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[0] * inv_dy01));
            }

            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, (int32_t)floorf((float)row_uvw_[1] * inv_dy20));
            } else if (dy20 < 0) {
                ixmin = swr_maxi(ixmin, (int32_t)ceilf((float)row_uvw_[1] * inv_dy20));
            }

            // Applied even when w is negative at the row start: the row start
            // is then already outside this edge, ixmax becomes negative and
            // the row is skipped. This happens when the screen clamps the
            // bounding box and the triangle's part of the row lies off-screen.
            if (dy01_dy20 < 0) {
                ixmax = swr_mini(ixmax, -(int32_t)ceilf((float)row_uvw_[2] * inv_dy01_dy20));
            } else if (dy01_dy20 > 0) {
                ixmin = swr_maxi(ixmin, -(int32_t)floorf((float)row_uvw_[2] * inv_dy01_dy20));
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

            // Extract barycentric coordinates for each pixel
            __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate color of each pixel
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v)));
                const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v)));
                const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                const __m128i imm_c_p1 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p1), imm_zero), imm_zero);
                const __m128i imm_c_p2 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p2), imm_zero), imm_zero);
                const __m128i imm_c_p3 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p3), imm_zero), imm_zero);

                // Pack the 4 colors into a XMM registers and store into framebuffer
                const __m128i imm_c_p0011 = _mm_shuffle_si128(imm_c_p0, imm_c_p1, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p2233 = _mm_shuffle_si128(imm_c_p2, imm_c_p3, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p0123 = _mm_shuffle_si128(imm_c_p0011, imm_c_p2233, _MM_SHUFFLE(2, 0, 2, 0));
                _mm_storeu_si128((__m128i*)frameBuffer, imm_c_p0123);

                // Move on to the next set of pixels
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du4);
                xmm_p1u = _mm_add_ps(xmm_p1u, xmm_x_du4);
                xmm_p2u = _mm_add_ps(xmm_p2u, xmm_x_du4);
                xmm_p3u = _mm_add_ps(xmm_p3u, xmm_x_du4);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv4);
                xmm_p1v = _mm_add_ps(xmm_p1v, xmm_x_dv4);
                xmm_p2v = _mm_add_ps(xmm_p2v, xmm_x_dv4);
                xmm_p3v = _mm_add_ps(xmm_p3v, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3 pixels),
            // one at a time, stepping the first pixel's u and v by one pixel.
            const uint32_t rem = numPixels & 3;
            for (uint32_t iRem = 0; iRem < rem; ++iRem) {
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// http://dss.stephanierct.com/DevBlog/?p=8
// Four 1.0f lanes laid out for loading into an SSE register.
// NOTE(review): the declaration requests no particular alignment; any aligned
// load of this array (_mm_load_ps) relies on the compiler happening to place
// it on a 16-byte boundary.
static const float xmm_ones[] = { 1.0f, 1.0f, 1.0f, 1.0f };
// SSE2 stand-in for the SSE4.1 _mm_floor_ps(): rounds each lane of `x` toward
// negative infinity.
//
// Each lane is truncated toward zero via a round trip through int32; lanes
// where truncation rounded up (negative non-integers) are then lowered by 1.
// Only valid for inputs whose truncated value fits in an int32; larger
// magnitudes, infinities and NaN do not survive the integer round trip.
//
// The 1.0f constant is built in a register with _mm_set1_ps() rather than
// read from memory with an aligned load, so no 16-byte-aligned storage is
// required.
static inline __m128 _mm_floor_ps_SSE2(__m128 x)
{
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
    // All-ones mask in lanes where truncation overshot x; AND turns it into 1.0f or 0.0f.
    const __m128 correction = _mm_and_ps(_mm_cmpgt_ps(truncated, x), one);
    return _mm_sub_ps(truncated, correction);
}
// SSE2 stand-in for the SSE4.1 _mm_ceil_ps(): rounds each lane of `x` toward
// positive infinity.
//
// Each lane is truncated toward zero via a round trip through int32; lanes
// where truncation rounded down (positive non-integers) are then raised by 1.
// Only valid for inputs whose truncated value fits in an int32; larger
// magnitudes, infinities and NaN do not survive the integer round trip.
//
// The 1.0f constant is built in a register with _mm_set1_ps() rather than
// read from memory with an aligned load, so no 16-byte-aligned storage is
// required.
static inline __m128 _mm_ceil_ps_SSE2(__m128 x)
{
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
    // All-ones mask in lanes where truncation undershot x; AND turns it into 1.0f or 0.0f.
    const __m128 correction = _mm_and_ps(_mm_cmplt_ps(truncated, x), one);
    return _mm_add_ps(truncated, correction);
}
// Same as swrDrawTriangle_7() but with floorf() and ceilf() function replaced by
// _mm_floor_ps_SSE2() and _mm_ceil_ps_SSE2()
//
// Fills a triangle with colors interpolated between its 3 vertices. For every row of the
// bounding box the covered pixel span [ixmin, ixmax] is solved directly from the 3 edge
// functions (u, v, w), then the span is shaded 4 pixels per iteration.
//
// ctx        : render target; m_Width, m_Height and m_FrameBuffer are read and the pixels
//              covered by the triangle are written to m_FrameBuffer.
// x0..y2     : vertex positions in pixels. Both windings are accepted (a negative area is
//              normalized by swapping vertices 1 and 2); zero-area triangles draw nothing.
// color0..2  : packed 32-bit vertex colors; each of the 4 bytes is interpolated separately.
//
// avg: 1.03ms, min: 0.98ms, max: 1.13ms (author's measurement)
static void swrDrawTriangle_8(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2) so that the area is positive from here on.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box limited to [0, m_Width - 1] x [0, m_Height - 1]
    // (assuming swr_mini()/swr_maxi()/swr_min3i()/swr_max3i() are plain integer min/max).
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;
    if (bboxWidth < 0 || bboxHeight < 0) {
        // Empty box: no pixel can be covered. Bail out before bboxMinX/bboxMinY are used
        // to form a framebuffer pointer.
        return;
    }

    const __m128i imm_zero = _mm_setzero_si128();

    // Vertex colors widened to one float lane per byte, plus the deltas used by
    // color = c0 + (c2 - c0) * u + (c1 - c0) * v.
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0);
    const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0);

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // Broadcast per-pixel and per-4-pixel steps of u and v.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Per-edge scale that turns the row-start value of u/v/w into the x offset at which
    // that edge is crossed.
    // NOTE(review): a lane is 1/0 when the matching edge is horizontal. The u/v branches
    // below appear to rely on the out-of-range float->int conversion producing a value
    // that leaves ixmin unchanged in that case - confirm.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
            const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

            // u changes by -dy01 per pixel: falling => upper bound, rising => lower bound.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // v changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // w changes by +dy01_dy20 per pixel. When w is falling the upper bound must be
            // applied even if w is already negative at the start of the row (possible when
            // the bounding box was clamped at the left screen edge): ixmax then becomes
            // negative and the row is skipped instead of being filled outside the triangle.
            if (dy01_dy20 < 0) {
                ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

            // Extract barycentric coordinates for each pixel
            __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate color of each pixel
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v)));
                const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v)));
                const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v)));

                // Narrow each pixel to 4 saturated bytes in the low DWORD.
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                const __m128i imm_c_p1 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p1), imm_zero), imm_zero);
                const __m128i imm_c_p2 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p2), imm_zero), imm_zero);
                const __m128i imm_c_p3 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p3), imm_zero), imm_zero);

                // Pack the 4 colors into a XMM registers and store into framebuffer
                const __m128i imm_c_p0011 = _mm_shuffle_si128(imm_c_p0, imm_c_p1, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p2233 = _mm_shuffle_si128(imm_c_p2, imm_c_p3, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128i imm_c_p0123 = _mm_shuffle_si128(imm_c_p0011, imm_c_p2233, _MM_SHUFFLE(2, 0, 2, 0));
                _mm_storeu_si128((__m128i*)frameBuffer, imm_c_p0123);

                // Move on to the next set of pixels
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du4);
                xmm_p1u = _mm_add_ps(xmm_p1u, xmm_x_du4);
                xmm_p2u = _mm_add_ps(xmm_p2u, xmm_x_du4);
                xmm_p3u = _mm_add_ps(xmm_p3u, xmm_x_du4);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv4);
                xmm_p1v = _mm_add_ps(xmm_p1v, xmm_x_dv4);
                xmm_p2v = _mm_add_ps(xmm_p2v, xmm_x_dv4);
                xmm_p3v = _mm_add_ps(xmm_p3v, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3), one pixel at a time.
            // xmm_p0u/xmm_p0v already hold the coordinates of the first remaining pixel.
            const uint32_t rem = numPixels & 3;
            for (uint32_t iRem = 0; iRem < rem; ++iRem) {
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Similar to swrDrawTriangle_8() except from the way the 4 pixel colors are packed into 4 DWORDs:
// the 4 converted pixels are narrowed together with 2 _mm_packs_epi32() + 1 _mm_packus_epi16()
// instead of being narrowed one by one and shuffled together.
//
// Fills a triangle with colors interpolated between its 3 vertices. For every row of the
// bounding box the covered pixel span [ixmin, ixmax] is solved directly from the 3 edge
// functions (u, v, w), then the span is shaded 4 pixels per iteration.
//
// ctx        : render target; m_Width, m_Height and m_FrameBuffer are read and the pixels
//              covered by the triangle are written to m_FrameBuffer.
// x0..y2     : vertex positions in pixels. Both windings are accepted (a negative area is
//              normalized by swapping vertices 1 and 2); zero-area triangles draw nothing.
// color0..2  : packed 32-bit vertex colors; each of the 4 bytes is interpolated separately.
//
// avg: 0.90ms, min: 0.86ms, max: 1.00ms (author's measurement)
static void swrDrawTriangle_9(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2) so that the area is positive from here on.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box limited to [0, m_Width - 1] x [0, m_Height - 1]
    // (assuming swr_mini()/swr_maxi()/swr_min3i()/swr_max3i() are plain integer min/max).
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;
    if (bboxWidth < 0 || bboxHeight < 0) {
        // Empty box: no pixel can be covered. Bail out before bboxMinX/bboxMinY are used
        // to form a framebuffer pointer.
        return;
    }

    const __m128i imm_zero = _mm_setzero_si128();

    // Vertex colors widened to one float lane per byte, plus the deltas used by
    // color = c0 + (c2 - c0) * u + (c1 - c0) * v.
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_c2_c0 = _mm_sub_ps(xmm_c2, xmm_c0);
    const __m128 xmm_c1_c0 = _mm_sub_ps(xmm_c1, xmm_c0);

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // Broadcast per-pixel and per-4-pixel steps of u and v.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Per-edge scale that turns the row-start value of u/v/w into the x offset at which
    // that edge is crossed.
    // NOTE(review): a lane is 1/0 when the matching edge is horizontal. The u/v branches
    // below appear to rely on the out-of-range float->int conversion producing a value
    // that leaves ixmin unchanged in that case - confirm.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
            const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

            // u changes by -dy01 per pixel: falling => upper bound, rising => lower bound.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // v changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // w changes by +dy01_dy20 per pixel. When w is falling the upper bound must be
            // applied even if w is already negative at the start of the row (possible when
            // the bounding box was clamped at the left screen edge): ixmax then becomes
            // negative and the row is skipped instead of being filled outside the triangle.
            if (dy01_dy20 < 0) {
                ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

            // Extract barycentric coordinates for each pixel
            __m128 xmm_p0u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p0v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p1u = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p1v = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p0uv_p1uv, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 xmm_p2u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(0, 0, 0, 0));
            __m128 xmm_p2v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(1, 1, 1, 1));
            __m128 xmm_p3u = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 2, 2, 2));
            __m128 xmm_p3v = _mm_shuffle_ps(xmm_p2uv_p3uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 3, 3, 3));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate the color of each pixel
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128 xmm_c_p1 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p1u), _mm_mul_ps(xmm_c1_c0, xmm_p1v)));
                const __m128 xmm_c_p2 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p2u), _mm_mul_ps(xmm_c1_c0, xmm_p2v)));
                const __m128 xmm_c_p3 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p3u), _mm_mul_ps(xmm_c1_c0, xmm_p3v)));

                // Convert to uint32_t
                const __m128i imm_rgba_p0_u32 = _mm_cvtps_epi32(xmm_c_p0);
                const __m128i imm_rgba_p1_u32 = _mm_cvtps_epi32(xmm_c_p1);
                const __m128i imm_rgba_p2_u32 = _mm_cvtps_epi32(xmm_c_p2);
                const __m128i imm_rgba_p3_u32 = _mm_cvtps_epi32(xmm_c_p3);

                // Pack into uint16_t
                const __m128i imm_rgba_p01_u16 = _mm_packs_epi32(imm_rgba_p0_u32, imm_rgba_p1_u32);
                const __m128i imm_rgba_p23_u16 = _mm_packs_epi32(imm_rgba_p2_u32, imm_rgba_p3_u32);

                // Pack into uint8_t
                const __m128i imm_rgba_p0123_u8 = _mm_packus_epi16(imm_rgba_p01_u16, imm_rgba_p23_u16);

                // Store
                _mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

                // Move on to the next set of pixels
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du4);
                xmm_p1u = _mm_add_ps(xmm_p1u, xmm_x_du4);
                xmm_p2u = _mm_add_ps(xmm_p2u, xmm_x_du4);
                xmm_p3u = _mm_add_ps(xmm_p3u, xmm_x_du4);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv4);
                xmm_p1v = _mm_add_ps(xmm_p1v, xmm_x_dv4);
                xmm_p2v = _mm_add_ps(xmm_p2v, xmm_x_dv4);
                xmm_p3v = _mm_add_ps(xmm_p3v, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3), one pixel at a time.
            // xmm_p0u/xmm_p0v already hold the coordinates of the first remaining pixel.
            const uint32_t rem = numPixels & 3;
            for (uint32_t iRem = 0; iRem < rem; ++iRem) {
                const __m128 xmm_c_p0 = _mm_add_ps(xmm_c0, _mm_add_ps(_mm_mul_ps(xmm_c2_c0, xmm_p0u), _mm_mul_ps(xmm_c1_c0, xmm_p0v)));
                const __m128i imm_c_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_c_p0);
                xmm_p0u = _mm_add_ps(xmm_p0u, xmm_x_du);
                xmm_p0v = _mm_add_ps(xmm_p0v, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Similar to swrDrawTriangle_9() but each of the RGBA channels is treated as a separate attribute
// to be interpolated over the triangle: one register holds a single channel of 4 consecutive
// pixels, and the result is transposed back to 4 packed pixels before the store.
//
// Fills a triangle with colors interpolated between its 3 vertices. For every row of the
// bounding box the covered pixel span [ixmin, ixmax] is solved directly from the 3 edge
// functions (u, v, w), then the span is shaded 4 pixels per iteration.
//
// ctx        : render target; m_Width, m_Height and m_FrameBuffer are read and the pixels
//              covered by the triangle are written to m_FrameBuffer.
// x0..y2     : vertex positions in pixels. Both windings are accepted (a negative area is
//              normalized by swapping vertices 1 and 2); zero-area triangles draw nothing.
// color0..2  : packed 32-bit vertex colors; each of the 4 bytes is interpolated separately
//              (bytes 0..3 are called r, g, b, a below).
//
// avg: 0.94ms, min: 0.90ms, max: 1.01ms (author's measurement)
static void swrDrawTriangle(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    } else if (iarea < 0) {
        // Swap (x1, y1) <-> (x2, y2) so that the area is positive from here on.
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box limited to [0, m_Width - 1] x [0, m_Height - 1]
    // (assuming swr_mini()/swr_maxi()/swr_min3i()/swr_max3i() are plain integer min/max).
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;
    if (bboxWidth < 0 || bboxHeight < 0) {
        // Empty box: no pixel can be covered. Bail out before bboxMinX/bboxMinY are used
        // to form a framebuffer pointer.
        return;
    }

    const __m128i imm_zero = _mm_setzero_si128();
    // 0x00FF in every 16-bit lane: keeps the even-indexed byte of each byte pair.
    const __m128i imm_even_byte_mask = _mm_set1_epi16(0x00FF);

    // Vertex colors widened to one float lane per byte, and their deltas relative to vertex 0.
    const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
    const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
    const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
    const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
    const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);

    // Every channel (and channel delta) broadcast to all 4 lanes, so that
    // channel = c0 + d20 * u + d10 * v can be evaluated for 4 pixels at once.
    const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));

    const int32_t dy01 = y0 - y1;
    const int32_t dx01 = x0 - x1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    const __m128 xmm_zero = _mm_setzero_ps();
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Barycentric coordinate deltas for the X direction
    const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
    const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
    const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
    const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
    const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);

    // UV deltas for the 1st and 2nd pixel
    const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
    // UV deltas for the 3rd and 4th pixel
    const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));

    // Broadcast per-pixel and per-4-pixel steps of u and v.
    const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));

    // Barycentric coordinate deltas for the Y direction
    const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);

    // Calculate unnormalized barycentric coordinates of the bounding box min.
    const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
    const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
    const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
    __m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);

    // Per-edge scale that turns the row-start value of u/v/w into the x offset at which
    // that edge is crossed.
    // NOTE(review): a lane is 1/0 when the matching edge is horizontal. The u/v branches
    // below appear to rely on the out-of-range float->int conversion producing a value
    // that leaves ixmin unchanged in that case - confirm.
    const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;

        // Calculate ixmin and ixmax
        {
            int32_t row_uvw_[4];
            _mm_storeu_si128((__m128i*)&row_uvw_[0], imm_row_uvw_);

            const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
            const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
            const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));

            int32_t row_uvw_floor[4];
            _mm_storeu_si128((__m128i*)&row_uvw_floor[0], imm_row_uvw_floor);
            int32_t row_uvw_ceil[4];
            _mm_storeu_si128((__m128i*)&row_uvw_ceil[0], imm_row_uvw_ceil);

            // u changes by -dy01 per pixel: falling => upper bound, rising => lower bound.
            if (dy01 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[0]);
            } else if (row_uvw_[0] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
            }

            // v changes by -dy20 per pixel.
            if (dy20 > 0) {
                ixmax = swr_mini(ixmax, row_uvw_floor[1]);
            } else if (row_uvw_[1] != 0) {
                ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
            }

            // w changes by +dy01_dy20 per pixel. When w is falling the upper bound must be
            // applied even if w is already negative at the start of the row (possible when
            // the bounding box was clamped at the left screen edge): ixmax then becomes
            // negative and the row is skipped instead of being filled outside the triangle.
            if (dy01_dy20 < 0) {
                ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
            } else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
                ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
            }
        }

        if (ixmin <= ixmax) {
            // Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
            const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
            const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
            const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));

            // Calculate barycentric coordinates for the 4 pixels.
            const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
            const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels

            // De-interleave into { u0, u1, u2, u3 } and { v0, v1, v2, v3 }.
            __m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
            __m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));

            uint32_t* frameBuffer = &framebufferRow[ixmin];
            const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
            const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
            for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
                // Calculate the color of each pixel
                const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));

                // Pack into uint8_t
                // (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
                const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
                    _mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
                );

                // https://stackoverflow.com/questions/24595003/permuting-bytes-inside-sse-m128i-register
                // _mm_shuffle_epi8() with SSE2: each step moves the even-indexed bytes to the
                // low half and the odd-indexed bytes to the high half; 2 steps transpose the 4x4 bytes.
                // (uint8_t){ r0, r2, g0, g2, b0, b2, a0, a2, r1, r3, g1, g3, b1, b3, a1, a3 }
                const __m128i imm_r02_g02_b02_a02_r13_g13_b13_a13_u8 =
                    _mm_packus_epi16(
                        _mm_and_si128(imm_r0123_g0123_b0123_a0123_u8, imm_even_byte_mask),
                        _mm_srli_epi16(imm_r0123_g0123_b0123_a0123_u8, 8)
                    );

                // (uint8_t){ r0, g0, b0, a0, r1, g1, b1, a1, r2, g2, b2, a2, r3, g3, b3, a3 }
                const __m128i imm_rgba_p0123_u8 =
                    _mm_packus_epi16(
                        _mm_and_si128(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, imm_even_byte_mask),
                        _mm_srli_epi16(imm_r02_g02_b02_a02_r13_g13_b13_a13_u8, 8)
                    );

                // Store
                _mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);

                // Move on to the next set of pixels
                xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
                xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
                frameBuffer += 4;
            }

            // Handle the remainder of pixels for this row (0 to 3), one pixel at a time.
            // Only lane 0 of xmm_u0123/xmm_v0123 is used and advanced (scalar _ss operations);
            // it already holds the coordinates of the first remaining pixel.
            const uint32_t rem = numPixels & 3;
            for (uint32_t iRem = 0; iRem < rem; ++iRem) {
                const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
                const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
                const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
                const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));

                // Gather lane 0 of the 4 channels into { r, g, b, a }.
                const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
                const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));

                const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
                _mm_storeu_si32(frameBuffer, imm_rgba_p0);

                xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
                xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
                frameBuffer++;
            }
        }

        // Move on to the next row of pixels.
        imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
        framebufferRow += ctx->m_Width;
    }
}
// Same as swrDrawTriangle() but with SSSE3 _mm_shuffle_epi8
//
// avg: 0.85ms, min: 0.80ms, max: 0.91ms
static void swrDrawTriangle_SSSE3(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
if (iarea == 0) {
// Degenerate triangle with 0 area.
return;
} else if (iarea < 0) {
// Swap (x1, y1) <-> (x2, y2)
{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
iarea = -iarea;
}
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
const int32_t bboxWidth = bboxMaxX - bboxMinX;
const int32_t bboxHeight = bboxMaxY - bboxMinY;
const __m128i imm_zero = _mm_setzero_si128();
const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);
const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));
const int32_t dy01 = y0 - y1;
const int32_t dx01 = x0 - x1;
const int32_t dx20 = x2 - x0;
const int32_t dy20 = y2 - y0;
const int32_t dy01_dy20 = dy01 + dy20;
const __m128 xmm_zero = _mm_setzero_ps();
const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);
// Barycentric coordinate deltas for the X direction
const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);
// UV deltas for the 1st and 2nd pixel
const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
// UV deltas for the 3rd and 4th pixel
const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));
// Barycentric coordinate deltas for the Y direction
const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);
// Calculate unnormalized barycentric coordinates of the bounding box min.
const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);
//
const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);
uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
int32_t ixmin = 0;
int32_t ixmax = (uint32_t)bboxWidth;
// Calculate ixmin and ixmax
{
int32_t row_uvw_[4];
_mm_storeu_si128((__m128i*) & row_uvw_[0], imm_row_uvw_);
const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps_SSE2(xmm_row_uvw_));
const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps_SSE2(xmm_row_uvw_));
int32_t row_uvw_floor[4];
_mm_storeu_si128((__m128i*) & row_uvw_floor[0], imm_row_uvw_floor);
int32_t row_uvw_ceil[4];
_mm_storeu_si128((__m128i*) & row_uvw_ceil[0], imm_row_uvw_ceil);
if (dy01 > 0) {
ixmax = swr_mini(ixmax, row_uvw_floor[0]);
} else if (row_uvw_[0] != 0) {
ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
}
if (dy20 > 0) {
ixmax = swr_mini(ixmax, row_uvw_floor[1]);
} else if (row_uvw_[1] != 0) {
ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
}
if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
}
}
if (ixmin <= ixmax) {
// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32_SSE2(_mm_set1_epi32(ixmin), imm_x_duvw_));
const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));
// Calculate barycentric coordinates for the 4 pixels.
const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels
// Extract barycentric coordinates for each pixel
__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));
uint32_t* frameBuffer = &framebufferRow[ixmin];
const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
// Calculate the color of each pixel
const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
// Pack into uint8_t
// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
);
// Shuffle into RGBA uint32_t
const __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, mask);
// Store
_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);
// Move on to the next set of pixels
xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
frameBuffer += 4;
}
// Handle the remainder of pixels for this row
const uint32_t rem = numPixels & 3;
switch (rem) {
case 3: {
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
_mm_storeu_si32(frameBuffer, imm_rgba_p0);
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
frameBuffer++;
} // fallthrough
case 2: {
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
_mm_storeu_si32(frameBuffer, imm_rgba_p0);
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
frameBuffer++;
} // fallthrough
case 1: {
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
_mm_storeu_si32(frameBuffer, imm_rgba_p0);
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
} // fallthrough
case 0:
default:
break;
}
}
// Move on to the next row of pixels.
imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
framebufferRow += ctx->m_Width;
}
}
// Draws a triangle whose color is interpolated between three per-vertex colors, writing
// 32-bit pixels into ctx->m_FrameBuffer.
//
// Same as swrDrawTriangle() but uses SSE 4.1 instructions for floorf(), ceilf() and mullo_epi32()
// (_mm_floor_ps, _mm_ceil_ps and _mm_mullo_epi32).
//
// ctx            - rendering context; m_Width, m_Height and m_FrameBuffer are read from it.
// (x0, y0)..(x2, y2) - integer vertex positions. The drawn area is clamped to
//                  [0, m_Width - 1] x [0, m_Height - 1].
// color0..color2 - one 32-bit color per vertex. Each byte is interpolated independently;
//                  the lowest byte is the channel called "r" below, the highest is "a".
//
// Triangles with zero signed area are skipped. When the signed area is negative, vertices 1 and 2
// (and their colors) are swapped so the rest of the function always works with a positive area.
//
// avg: 0.82ms, min: 0.78ms, max: 0.94ms
static void swrDrawTriangle_SSE41(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
// Signed area term of the triangle; its sign encodes the vertex winding.
int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
if (iarea == 0) {
// Degenerate triangle with 0 area.
return;
} else if (iarea < 0) {
// Swap (x1, y1) <-> (x2, y2)
{ int32_t tmp = x1; x1 = x2; x2 = tmp; }
{ int32_t tmp = y1; y1 = y2; y2 = tmp; }
{ uint32_t tmp = color1; color1 = color2; color2 = tmp; }
iarea = -iarea;
}
// Bounding box of the triangle, clamped to the framebuffer rectangle.
// bboxWidth/bboxHeight are inclusive extents (max - min), so they can be negative
// when the clamped box is empty.
const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
const int32_t bboxWidth = bboxMaxX - bboxMinX;
const int32_t bboxHeight = bboxMaxY - bboxMinY;
const __m128i imm_zero = _mm_setzero_si128();
// Expand each packed color: 4 bytes -> 4 x 16-bit -> 4 x 32-bit integers -> 4 floats (one lane per channel).
const __m128 xmm_rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), imm_zero), imm_zero));
const __m128 xmm_rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), imm_zero), imm_zero));
const __m128 xmm_rgba2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), imm_zero), imm_zero));
// Color differences relative to vertex 0. A pixel's color is computed below as
// color0 + (color2 - color0) * u + (color1 - color0) * v.
const __m128 xmm_drgba20 = _mm_sub_ps(xmm_rgba2, xmm_rgba0);
const __m128 xmm_drgba10 = _mm_sub_ps(xmm_rgba1, xmm_rgba0);
// Broadcast every channel of the base color and of both deltas to all 4 lanes, so that
// one register holds the same channel for 4 horizontally adjacent pixels.
const __m128 xmm_r0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_g0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_b0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_a0 = _mm_shuffle_ps(xmm_rgba0, xmm_rgba0, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 xmm_dr20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_dg20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_db20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_da20 = _mm_shuffle_ps(xmm_drgba20, xmm_drgba20, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 xmm_dr10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_dg10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_db10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 xmm_da10 = _mm_shuffle_ps(xmm_drgba10, xmm_drgba10, _MM_SHUFFLE(3, 3, 3, 3));
// Edge deltas used to step the (unnormalized) barycentric coordinates u, v, w.
const int32_t dy01 = y0 - y1;
const int32_t dx01 = x0 - x1;
const int32_t dx20 = x2 - x0;
const int32_t dy20 = y2 - y0;
const int32_t dy01_dy20 = dy01 + dy20;
const __m128 xmm_zero = _mm_setzero_ps();
// Multiplying an unnormalized coordinate by 1 / area normalizes it.
const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);
// Barycentric coordinate deltas for the X direction
// Lane order (low to high) is { u, v, w, unused }: one pixel to the right changes
// u by -dy01, v by -dy20 and w by dy01 + dy20.
const __m128i imm_x_duvw_ = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);
// Normalized X deltas for a step of 1, 2, 3 and 4 pixels.
const __m128 xmm_x_duvw_1 = _mm_mul_ps(_mm_cvtepi32_ps(imm_x_duvw_), xmm_inv_area);
const __m128 xmm_x_duvw_2 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_1);
const __m128 xmm_x_duvw_3 = _mm_add_ps(xmm_x_duvw_1, xmm_x_duvw_2);
const __m128 xmm_x_duvw_4 = _mm_add_ps(xmm_x_duvw_2, xmm_x_duvw_2);
// UV deltas for the 1st and 2nd pixel
// Layout: { 0, 0, du, dv }
const __m128 xmm_x_duv0_duv1 = _mm_shuffle_ps(xmm_zero, xmm_x_duvw_1, _MM_SHUFFLE(1, 0, 1, 0));
// UV deltas for the 3rd and 4th pixel
// Layout: { 2*du, 2*dv, 3*du, 3*dv }
const __m128 xmm_x_duv2_duv3 = _mm_shuffle_ps(xmm_x_duvw_2, xmm_x_duvw_3, _MM_SHUFFLE(1, 0, 1, 0));
// Broadcast 1-pixel steps (used by the scalar remainder) and 4-pixel steps (used by the main loop).
const __m128 xmm_x_du = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_x_dv = _mm_shuffle_ps(xmm_x_duvw_1, xmm_x_duvw_1, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 xmm_x_du4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_x_dv4 = _mm_shuffle_ps(xmm_x_duvw_4, xmm_x_duvw_4, _MM_SHUFFLE(1, 1, 1, 1));
// Barycentric coordinate deltas for the Y direction
// Same { u, v, w, unused } lane order; kept as integers and added once per row.
const __m128i imm_y_duvw_ = _mm_set_epi32(0, -(dx01 + dx20), dx20, dx01);
// Calculate unnormalized barycentric coordinates of the bounding box min.
// w is derived from u and v so that u + v + w == iarea.
const int32_t bboxMin_u = (x0 - bboxMinX) * dy01 - (y0 - bboxMinY) * dx01;
const int32_t bboxMin_v = (x0 - bboxMinX) * dy20 - (y0 - bboxMinY) * dx20;
const int32_t bboxMin_w = iarea - bboxMin_u - bboxMin_v;
__m128i imm_row_uvw_ = _mm_set_epi32(0, bboxMin_w, bboxMin_v, bboxMin_u);
// Per-lane reciprocals { 1/dy01, 1/dy20, 1/(dy01 + dy20), 0 }. Multiplying a row's start
// coordinates by these gives, per edge, the X offset (relative to bboxMinX) at which that
// coordinate reaches zero on the row (with the sign flipped for w).
// NOTE(review): a divisor can be 0 here (e.g. dy01 == 0 when y0 == y1), which makes that
// lane 1.0f / 0.0f. Confirm that the floor/ceil values produced from such a lane behave
// as intended in the branches below.
const __m128 xmm_row_uvw_scale = _mm_set_ps(0.0f, 1.0f / (float)dy01_dy20, 1.0f / (float)dy20, 1.0f / (float)dy01);
// Pointer to the pixel at (bboxMinX, bboxMinY); advanced by one framebuffer row per iteration.
uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
// [ixmin, ixmax] is the inclusive pixel span of this row, relative to bboxMinX.
// It starts as the whole bounding box width and is narrowed by each edge below.
int32_t ixmin = 0;
int32_t ixmax = (uint32_t)bboxWidth;
// Calculate ixmin and ixmax
{
// Integer copy of the row's start coordinates, used for the sign/zero tests.
int32_t row_uvw_[4];
_mm_storeu_si128((__m128i*) & row_uvw_[0], imm_row_uvw_);
// Zero-crossing offsets of u, v and w on this row, rounded down and up.
const __m128 xmm_row_uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_row_uvw_), xmm_row_uvw_scale);
const __m128i imm_row_uvw_floor = _mm_cvtps_epi32(_mm_floor_ps(xmm_row_uvw_));
const __m128i imm_row_uvw_ceil = _mm_cvtps_epi32(_mm_ceil_ps(xmm_row_uvw_));
int32_t row_uvw_floor[4];
_mm_storeu_si128((__m128i*) & row_uvw_floor[0], imm_row_uvw_floor);
int32_t row_uvw_ceil[4];
_mm_storeu_si128((__m128i*) & row_uvw_ceil[0], imm_row_uvw_ceil);
// u changes by -dy01 per pixel: when dy01 > 0 it decreases to the right, so its
// zero crossing limits the right end of the span; otherwise it limits the left end.
if (dy01 > 0) {
ixmax = swr_mini(ixmax, row_uvw_floor[0]);
} else if (row_uvw_[0] != 0) {
ixmin = swr_maxi(ixmin, row_uvw_ceil[0]);
}
// Same test for v, which changes by -dy20 per pixel.
if (dy20 > 0) {
ixmax = swr_mini(ixmax, row_uvw_floor[1]);
} else if (row_uvw_[1] != 0) {
ixmin = swr_maxi(ixmin, row_uvw_ceil[1]);
}
// w changes by +(dy01 + dy20) per pixel, so its zero crossing is at -(w / (dy01 + dy20));
// hence the negated ceil/floor values.
if (dy01_dy20 < 0 && row_uvw_[2] >= 0) {
ixmax = swr_mini(ixmax, -row_uvw_ceil[2]);
} else if (dy01_dy20 > 0 && row_uvw_[2] < 0) {
ixmin = swr_maxi(ixmin, -row_uvw_floor[2]);
}
}
// An empty span (ixmin > ixmax) means no pixel of this row is drawn.
if (ixmin <= ixmax) {
// Calculate normalized barycentric coordinates at ixmin of the current row of pixels.
// The step from the row start to ixmin is done in integers before normalizing.
const __m128i imm_p0uvw_ = _mm_add_epi32(imm_row_uvw_, _mm_mullo_epi32(_mm_set1_epi32(ixmin), imm_x_duvw_));
const __m128 xmm_p0uvw_ = _mm_mul_ps(_mm_cvtepi32_ps(imm_p0uvw_), xmm_inv_area);
// Layout: { u, v, u, v }
const __m128 xmm_p0uvuv = _mm_shuffle_ps(xmm_p0uvw_, xmm_p0uvw_, _MM_SHUFFLE(1, 0, 1, 0));
// Calculate barycentric coordinates for the 4 pixels.
const __m128 xmm_p0uv_p1uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv0_duv1); // Barycentric coordinates of 1st and 2nd pixels
const __m128 xmm_p2uv_p3uv = _mm_add_ps(xmm_p0uvuv, xmm_x_duv2_duv3); // Barycentric coordinates of 3rd and 4th pixels
// Extract barycentric coordinates for each pixel
// De-interleave into { u0, u1, u2, u3 } and { v0, v1, v2, v3 }.
__m128 xmm_u0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(2, 0, 2, 0));
__m128 xmm_v0123 = _mm_shuffle_ps(xmm_p0uv_p1uv, xmm_p2uv_p3uv, _MM_SHUFFLE(3, 1, 3, 1));
uint32_t* frameBuffer = &framebufferRow[ixmin];
const uint32_t numPixels = (uint32_t)((ixmax - ixmin) + 1);
const uint32_t numIter = numPixels >> 2; // 4 pixels per iteration
for (uint32_t iIter = 0; iIter < numIter; ++iIter) {
// Calculate the color of each pixel
// channel = channel0 + d20 * u + d10 * v, evaluated for 4 pixels at once.
const __m128 xmm_r_p0123 = _mm_add_ps(xmm_r0, _mm_add_ps(_mm_mul_ps(xmm_dr20, xmm_u0123), _mm_mul_ps(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0123 = _mm_add_ps(xmm_g0, _mm_add_ps(_mm_mul_ps(xmm_dg20, xmm_u0123), _mm_mul_ps(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0123 = _mm_add_ps(xmm_b0, _mm_add_ps(_mm_mul_ps(xmm_db20, xmm_u0123), _mm_mul_ps(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0123 = _mm_add_ps(xmm_a0, _mm_add_ps(_mm_mul_ps(xmm_da20, xmm_u0123), _mm_mul_ps(xmm_da10, xmm_v0123)));
// Pack into uint8_t
// (uint8_t){ r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3, a0, a1, a2, a3 }
// The two pack steps saturate, so each channel ends up within 0..255.
const __m128i imm_r0123_g0123_b0123_a0123_u8 = _mm_packus_epi16(
_mm_packs_epi32(_mm_cvtps_epi32(xmm_r_p0123), _mm_cvtps_epi32(xmm_g_p0123)),
_mm_packs_epi32(_mm_cvtps_epi32(xmm_b_p0123), _mm_cvtps_epi32(xmm_a_p0123))
);
// Shuffle into RGBA uint32_t
// The byte shuffle transposes the channel-major layout above into pixel-major order
// { r0, g0, b0, a0, r1, g1, b1, a1, ... }.
const __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
const __m128i imm_rgba_p0123_u8 = _mm_shuffle_epi8(imm_r0123_g0123_b0123_a0123_u8, mask);
// Store
// Unaligned 16-byte store: writes 4 consecutive pixels.
_mm_storeu_si128((__m128i*)frameBuffer, imm_rgba_p0123_u8);
// Move on to the next set of pixels
xmm_u0123 = _mm_add_ps(xmm_u0123, xmm_x_du4);
xmm_v0123 = _mm_add_ps(xmm_v0123, xmm_x_dv4);
frameBuffer += 4;
}
// Handle the remainder of pixels for this row
// Up to 3 leftover pixels are drawn one at a time using only lane 0 of the u/v registers.
// The cases fall through on purpose: entering at "case 3" draws 3 pixels, "case 2" draws 2, etc.
const uint32_t rem = numPixels & 3;
switch (rem) {
case 3: {
// Scalar (lane 0) interpolation of each channel for a single pixel.
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));
// Gather the four lane-0 results into one { r, g, b, a } register.
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));
// Convert to integers, pack down to 4 bytes with saturation and store one pixel.
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
_mm_storeu_si32(frameBuffer, imm_rgba_p0);
// Advance lane 0 of u and v by one pixel.
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
frameBuffer++;
} // fallthrough
case 2: {
// Same single-pixel sequence as in "case 3".
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
_mm_storeu_si32(frameBuffer, imm_rgba_p0);
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
frameBuffer++;
} // fallthrough
case 1: {
// Last pixel of the row; same single-pixel sequence as above.
const __m128 xmm_r_p0 = _mm_add_ss(xmm_r0, _mm_add_ss(_mm_mul_ss(xmm_dr20, xmm_u0123), _mm_mul_ss(xmm_dr10, xmm_v0123)));
const __m128 xmm_g_p0 = _mm_add_ss(xmm_g0, _mm_add_ss(_mm_mul_ss(xmm_dg20, xmm_u0123), _mm_mul_ss(xmm_dg10, xmm_v0123)));
const __m128 xmm_b_p0 = _mm_add_ss(xmm_b0, _mm_add_ss(_mm_mul_ss(xmm_db20, xmm_u0123), _mm_mul_ss(xmm_db10, xmm_v0123)));
const __m128 xmm_a_p0 = _mm_add_ss(xmm_a0, _mm_add_ss(_mm_mul_ss(xmm_da20, xmm_u0123), _mm_mul_ss(xmm_da10, xmm_v0123)));
const __m128 xmm_rrgg_p0 = _mm_shuffle_ps(xmm_r_p0, xmm_g_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_bbaa_p0 = _mm_shuffle_ps(xmm_b_p0, xmm_a_p0, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 xmm_rgba_p0 = _mm_shuffle_ps(xmm_rrgg_p0, xmm_bbaa_p0, _MM_SHUFFLE(2, 0, 2, 0));
const __m128i imm_rgba_p0 = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_rgba_p0), imm_zero), imm_zero);
_mm_storeu_si32(frameBuffer, imm_rgba_p0);
// These updated u/v values are not read again before the row ends.
xmm_u0123 = _mm_add_ss(xmm_u0123, xmm_x_du);
xmm_v0123 = _mm_add_ss(xmm_v0123, xmm_x_dv);
} // fallthrough
case 0:
default:
break;
}
}
// Move on to the next row of pixels.
imm_row_uvw_ = _mm_add_epi32(imm_row_uvw_, imm_y_duvw_);
framebufferRow += ctx->m_Width;
}
}
// Draws the characters in [str, end) left to right, starting with the top-left corner
// of the first glyph at (x0, y0). All glyphs are placed on the same line; the pen only
// advances horizontally, by the font's character width.
//
// ctx   - rendering context handed to swrDrawPixel().
// font  - bitmap font: one byte per glyph row, one bit per pixel (bit 0 is the leftmost pixel).
// str   - first character to draw.
// end   - one past the last character to draw, or NULL to draw up to the terminating '\0'.
// color - color passed to swrDrawPixel() for every set glyph bit.
static void swrDrawText(swr_context* ctx, const swr_font* font, int32_t x0, int32_t y0, const char* str, const char* end, uint32_t color)
{
	if (end == NULL) {
		end = str + strlen(str);
	}

	const int32_t glyphW = (int32_t)font->m_CharWidth;
	const int32_t glyphH = (int32_t)font->m_CharHeight;
	const uint8_t* glyphTable = font->m_CharData;

	int32_t penX = x0;
	for (const char* cursor = str; cursor != end; ++cursor, penX += glyphW) {
		// Characters outside the font's range are replaced by the fallback character.
		char ch = *cursor;
		if (ch < font->m_CharMin || ch > font->m_CharMax) {
			ch = font->m_MissingCharFallbackID;
		}

		// Glyphs are stored back to back, glyphH bytes each, starting at m_CharMin.
		const uint8_t glyphID = (uint8_t)ch - font->m_CharMin;
		const uint8_t* glyphRows = &glyphTable[glyphID * glyphH];

		for (int32_t row = 0; row < glyphH; ++row) {
			const uint8_t rowBits = glyphRows[row];
			for (int32_t col = 0; col < glyphW; ++col) {
				const uint32_t colMask = 1u << col;
				if ((rowBits & colMask) == 0) {
					continue;
				}
				swrDrawPixel(ctx, penX + col, y0 + row, color);
			}
		}
	}
}
// Appends a sample to the moving-average window.
//
// The window keeps at most 128 samples in a ring buffer: m_NextItemID is the slot the
// next sample is written to and m_NumItems is the number of valid samples (capped at 128).
// Once the window is full, each new sample replaces the oldest one.
//
// avg - window to update.
// val - sample to store.
static void movAvgPush(moving_averaged* avg, double val)
{
	enum { kMovAvgCapacity = 128 };

	const uint32_t id = avg->m_NextItemID;
	avg->m_Value[id] = val;

	avg->m_NumItems++;
	if (avg->m_NumItems >= kMovAvgCapacity) {
		avg->m_NumItems = kMovAvgCapacity;
	}

	// Wrap around the fixed capacity, not the current item count. Wrapping around the
	// (just incremented) count made the second push overwrite the first sample and left
	// slot m_NumItems - 1 unwritten while the window was still filling up, so readers
	// iterating over [0, m_NumItems) saw one slot that was never pushed.
	avg->m_NextItemID = (id + 1) % kMovAvgCapacity;
}
// Returns the arithmetic mean of the samples currently stored in the window
// (slots [0, m_NumItems)).
//
// Returns 0.0 when the window holds no samples, instead of dividing 0.0 by 0.
static double movAvgGetAverage(const moving_averaged* avg)
{
	const uint32_t n = avg->m_NumItems;
	if (n == 0) {
		return 0.0;
	}

	double sum = 0.0;
	for (uint32_t i = 0; i < n; ++i) {
		sum += avg->m_Value[i];
	}
	return sum / (double)n;
}
// Finds the smallest and largest sample among slots [0, m_NumItems) of the window.
//
// avg  - window to scan. Slot 0 is always read, even when m_NumItems is 0.
// tmin - receives the smallest sample.
// tmax - receives the largest sample.
static void movAvgGetMinMax(const moving_averaged* avg, double* tmin, double* tmax)
{
	const uint32_t count = avg->m_NumItems;

	// Seed both extremes with the first slot, then scan the remaining ones.
	double lowest = avg->m_Value[0];
	double highest = avg->m_Value[0];
	for (uint32_t idx = 1; idx < count; ++idx) {
		const double sample = avg->m_Value[idx];
		if (sample < lowest) {
			lowest = sample;
		}
		if (sample > highest) {
			highest = sample;
		}
	}

	*tmin = lowest;
	*tmax = highest;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment