// strokerConvexFillAA (benchmark)
#include <stdint.h>
#include <inttypes.h>
#include <math.h>
#include <malloc.h>
#include <memory.h>
#include <Windows.h>
#include <stdio.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include "iacaMarks.h"
#define INSERT_IACA_MARKERS 0
#define CREATE_VERTEX_BUFFER 0
#define CREATE_INDEX_BUFFER 1
#ifdef _DEBUG
#define NUM_ITERATIONS 1
#else
#if CREATE_VERTEX_BUFFER
#define NUM_ITERATIONS 1000000
#else
#define NUM_ITERATIONS 10000000
#endif
#endif
#define NUM_VERTICES 1024
#define SIMD_INDEX_BUFFER 1
#define TEST_SIMD 3
#define VERIFY_SIMD 1
// 0: 1.0 / sqrt
// 1: _mm_rsqrt_ss
// 2: Newton/Raphson
#define RSQRT_ALGORITHM 2
// 0: 1.0 / a
// 1: _mm_rcp_ss
// 2: Newton/Raphson
#define RCP_ALGORITHM 1
#define VG_EPSILON 1e-5f
#define PI 3.1415926f
struct Vec2
{
float x, y;
};
static inline float fsign(float a)
{
return a < 0.0f ? -1.0f : 1.0f;
}
static const __m128 xmm_half = _mm_set_ps1(0.5f);
static const __m128 xmm_one = _mm_set_ps1(1.0f);
static const __m128 xmm_three = _mm_set_ps1(3.0f);
static const __m128 oneish = _mm_castsi128_ps(_mm_set1_epi32(0x3f800001)); // 1.0f + 1 ulp, used by the Newton-Raphson rcp refinement
static const __m128 vec2_perpCCW_xorMask = _mm_castsi128_ps(_mm_set_epi32(0, 0, 0, 0x80000000)); // flips the sign of lane 0 (gives { -a.y, a.x } after the swap in _mm_vec2_rotCCW90)
static inline float rsqrt(float a)
{
#if RSQRT_ALGORITHM == 0
return 1.0f / sqrtf(a);
#elif RSQRT_ALGORITHM == 1
float res;
__m128 rsqrtRes = _mm_rsqrt_ss(_mm_load_ss(&a));
_mm_store_ss(&res, rsqrtRes);
return res;
#elif RSQRT_ALGORITHM == 2
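// One Newton-Raphson refinement of the hardware estimate: x' = 0.5 * x * (3 - a * x * x).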
const __m128 xmm_a = _mm_load_ss(&a);
const __m128 rsqrtEst = _mm_rsqrt_ss(xmm_a);
const __m128 iter0 = _mm_mul_ss(xmm_a, rsqrtEst);
const __m128 iter1 = _mm_mul_ss(iter0, rsqrtEst);
const __m128 half_rsqrt = _mm_mul_ss(xmm_half, rsqrtEst);
const __m128 three_sub_iter1 = _mm_sub_ss(xmm_three, iter1);
const __m128 result = _mm_mul_ss(half_rsqrt, three_sub_iter1);
float res;
_mm_store_ss(&res, result);
return res;
#endif
}
static inline float rcp(float a)
{
#if RCP_ALGORITHM == 0
return 1.0f / a;
#elif RCP_ALGORITHM == 1
float res;
__m128 rcpRes = _mm_rcp_ss(_mm_load_ss(&a));
_mm_store_ss(&res, rcpRes);
return res;
#elif RCP_ALGORITHM == 2
// One Newton-Raphson step: est' = est * (2 - a * est), written here as est + (oneish - a * est) * est.
const __m128 xmm_a = _mm_load_ss(&a);
const __m128 est = _mm_rcp_ss(xmm_a);
const __m128 residual = _mm_sub_ss(oneish, _mm_mul_ss(xmm_a, est));
const __m128 result = _mm_add_ss(_mm_mul_ss(residual, est), est);
float res;
_mm_store_ss(&res, result);
return res;
#endif
}
inline Vec2 vec2Add(const Vec2& a, const Vec2& b) { return{ a.x + b.x, a.y + b.y }; }
inline Vec2 vec2Sub(const Vec2& a, const Vec2& b) { return{ a.x - b.x, a.y - b.y }; }
inline Vec2 vec2Scale(const Vec2& a, float s) { return{ a.x * s, a.y * s }; }
inline Vec2 vec2PerpCCW(const Vec2& a) { return{ -a.y, a.x }; }
inline Vec2 vec2PerpCW(const Vec2& a) { return{ a.y, -a.x }; }
inline float vec2Cross(const Vec2& a, const Vec2& b) { return a.x * b.y - b.x * a.y; }
inline float vec2Dot(const Vec2& a, const Vec2& b) { return a.x * b.x + a.y * b.y; }
// Direction from a to b
inline Vec2 vec2Dir(const Vec2& a, const Vec2& b)
{
const float dx = b.x - a.x;
const float dy = b.y - a.y;
const float lenSqr = dx * dx + dy * dy;
#if TEST_SIMD && VERIFY_SIMD
const float invLen = lenSqr < VG_EPSILON ? 0.0f : 1.0f / sqrtf(lenSqr);
#else
const float invLen = lenSqr < VG_EPSILON ? 0.0f : rsqrt(lenSqr);
#endif
return{ dx * invLen, dy * invLen };
}
inline Vec2 calcExtrusionVector(const Vec2& d01, const Vec2& d12)
{
// v is the vector from the path point to the outline point, assuming a stroke width of 1.0.
// Equation obtained by solving the intersection of the 2 line segments. d01 and d12 are
// assumed to be normalized.
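// With |d01| = |d12| = 1, the two edges offset by w along their CCW normals are
// p + w * perpCCW(d01) + t * d01 and p + w * perpCCW(d12) + s * d12; solving for their
// intersection gives the miter direction v = (d01 - d12) / cross(d12, d01).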
Vec2 v = vec2PerpCCW(d01);
const float cross = vec2Cross(d12, d01);
if (fabsf(cross) > VG_EPSILON) {
#if TEST_SIMD && VERIFY_SIMD
v = vec2Scale(vec2Sub(d01, d12), 1.0f / cross);
#else
v = vec2Scale(vec2Sub(d01, d12), rcp(cross));
#endif
}
return v;
}
struct Stroker
{
float m_FringeWidth;
uint32_t m_NumVertices;
uint32_t m_NumIndices;
uint32_t m_VertexCapacity;
Vec2* m_PosBuffer;
uint32_t* m_ColorBuffer;
uint16_t* m_IndexBuffer;
uint32_t m_IndexCapacity;
};
static void resetGeometry(Stroker* stroker)
{
stroker->m_NumVertices = 0;
stroker->m_NumIndices = 0;
}
static void reallocVB(Stroker* stroker, uint32_t n)
{
stroker->m_VertexCapacity += n;
stroker->m_PosBuffer = (Vec2*)_aligned_realloc(stroker->m_PosBuffer, sizeof(Vec2) * stroker->m_VertexCapacity, 16);
stroker->m_ColorBuffer = (uint32_t*)_aligned_realloc(stroker->m_ColorBuffer, sizeof(uint32_t) * stroker->m_VertexCapacity, 16);
#if 0
memset(stroker->m_PosBuffer, 0xFF, sizeof(Vec2) * stroker->m_VertexCapacity);
#endif
}
static void expandVB(Stroker* stroker, uint32_t n)
{
if (stroker->m_NumVertices + n > stroker->m_VertexCapacity) {
reallocVB(stroker, n);
}
}
static void reallocIB(Stroker* stroker, uint32_t n)
{
stroker->m_IndexCapacity += n;
stroker->m_IndexBuffer = (uint16_t*)_aligned_realloc(stroker->m_IndexBuffer, sizeof(uint16_t) * stroker->m_IndexCapacity, 16);
}
static void expandIB(Stroker* stroker, uint32_t n)
{
if (stroker->m_NumIndices + n > stroker->m_IndexCapacity) {
reallocIB(stroker, n);
}
}
static void strokerConvexFillAA(Stroker* stroker, const float* vertexList, uint32_t numVertices)
{
const Vec2* vtx = (const Vec2*)vertexList;
const float cross = vec2Cross(vec2Sub(vtx[1], vtx[0]), vec2Sub(vtx[2], vtx[0]));
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross);
const uint32_t numTris =
(numVertices - 2) + // Triangle fan
(numVertices * 2); // AA fringes
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point.
const uint32_t numDrawIndices = numTris * 3;
resetGeometry(stroker);
#if CREATE_VERTEX_BUFFER
// Vertex buffer
{
expandVB(stroker, numDrawVertices);
Vec2 d01 = vec2Dir(vtx[numVertices - 1], vtx[0]);
Vec2* dstPos = stroker->m_PosBuffer;
for (uint32_t iSegment = 0; iSegment < numVertices; ++iSegment) {
const Vec2& p1 = vtx[iSegment];
const Vec2& p2 = vtx[iSegment == numVertices - 1 ? 0 : iSegment + 1];
const Vec2 d12 = vec2Dir(p1, p2);
const Vec2 v = calcExtrusionVector(d01, d12);
const Vec2 v_aa = vec2Scale(v, aa);
dstPos[0] = vec2Add(p1, v_aa);
dstPos[1] = vec2Sub(p1, v_aa);
dstPos += 2;
d01 = d12;
}
stroker->m_NumVertices += numDrawVertices;
}
#endif // CREATE_VERTEX_BUFFER
#if CREATE_INDEX_BUFFER
// Index buffer
{
expandIB(stroker, numDrawIndices);
uint16_t* dstIndex = stroker->m_IndexBuffer;
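// Vertex layout (see the vertex buffer pass): vertex 2*i is polygon point i offset by +v*aa,
// vertex 2*i+1 is the same point offset by -v*aa. The triangle fan uses only the even vertices;
// each fringe quad connects the pair (2*i, 2*i+1) to the next pair (2*i+2, 2*i+3), wrapping
// around to (0, 1) at the end.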
// First fringe quad
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3;
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2;
dstIndex += 6;
const uint32_t numFanTris = numVertices - 2;
uint16_t secondTriVertex = 2;
for (uint32_t i = 0; i < numFanTris; ++i) {
const uint16_t id0 = secondTriVertex;
const uint16_t id1 = secondTriVertex + 1;
const uint16_t id2 = secondTriVertex + 2;
const uint16_t id3 = secondTriVertex + 3;
// Fan triangle
dstIndex[0] = 0;
dstIndex[1] = id0;
dstIndex[2] = id2;
// Fringe quad
dstIndex[3] = id0;
dstIndex[4] = id1;
dstIndex[5] = id3;
dstIndex[6] = id0;
dstIndex[7] = id3;
dstIndex[8] = id2;
dstIndex += 9;
secondTriVertex += 2;
}
// Last fringe quad
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1);
dstIndex[0] = lastID;
dstIndex[1] = lastID + 1;
dstIndex[2] = 1;
dstIndex[3] = lastID;
dstIndex[4] = 1;
dstIndex[5] = 0;
stroker->m_NumIndices += numDrawIndices;
}
#endif // CREATE_INDEX_BUFFER
}
static inline float _mm_vec2_cross(const __m128 a, const __m128 b)
{
const __m128 axy_bxy = _mm_movelh_ps(a, b); // { a.x, a.y, b.x, b.y }
const __m128 byx_ayx = _mm_shuffle_ps(axy_bxy, axy_bxy, _MM_SHUFFLE(0, 1, 2, 3)); // { b.y, b.x, a.y, a.x }
const __m128 axby_aybx = _mm_mul_ps(axy_bxy, byx_ayx); // { a.x * b.y, a.y * b.x, b.x * a.y, b.y * a.x }
const __m128 bxay = _mm_shuffle_ps(axby_aybx, axby_aybx, _MM_SHUFFLE(1, 1, 1, 1)); // { a.y * b.x, a.y * b.x, a.y * b.x, a.y * b.x }
const __m128 cross = _mm_sub_ss(axby_aybx, bxay);
return _mm_cvtss_f32(cross);
}
static inline __m128 _mm_vec2_dir(const __m128 a, const __m128 b)
{
const __m128 dxy = _mm_sub_ps(b, a); // { dx, dy, DC, DC }
const __m128 dxySqr = _mm_mul_ps(dxy, dxy); // { dx * dx, dy * dy, DC, DC }
const __m128 dySqr = _mm_shuffle_ps(dxySqr, dxySqr, _MM_SHUFFLE(1, 1, 1, 1)); // { dy * dy, dy * dy, dy * dy, dy * dy }
const float lenSqr = _mm_cvtss_f32(_mm_add_ss(dxySqr, dySqr));
__m128 dir = _mm_setzero_ps();
if (lenSqr >= VG_EPSILON) {
const __m128 invLen = _mm_set_ps1(rsqrt(lenSqr));
dir = _mm_mul_ps(dxy, invLen);
}
return dir;
}
static inline __m128 _mm_vec2_rotCCW90(const __m128 a)
{
__m128 ayx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)); // { a.y, a.x, DC, DC }
return _mm_xor_ps(ayx, vec2_perpCCW_xorMask); // { -a.y, a.x, DC, DC }
}
static inline __m128 calcExtrusionVector(const __m128 d01, const __m128 d12)
{
const float cross = _mm_vec2_cross(d12, d01);
return (fabsf(cross) > VG_EPSILON) ? _mm_mul_ps(_mm_sub_ps(d01, d12), _mm_set_ps1(rcp(cross))) : _mm_vec2_rotCCW90(d01);
}
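// SSE variant of strokerConvexFillAA: processes 2 polygon points per loop iteration using the
// __m128 helpers above (which still branch per segment) and writes the packed vertex pairs with
// aligned 16-byte stores. The index buffer pass is identical to the scalar version.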
static void strokerConvexFillAA_SIMD(Stroker* stroker, const float* vertexList, uint32_t numVertices)
{
const uint32_t lastVertexID = numVertices - 1;
const __m128 vtx0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)vertexList);
const __m128 vtx1 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 2));
const __m128 vtx2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 4));
const float cross = _mm_vec2_cross(_mm_sub_ps(vtx1, vtx0), _mm_sub_ps(vtx2, vtx0));
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross);
const __m128 xmm_aa = _mm_set_ps1(aa);
const uint32_t numTris =
(numVertices - 2) + // Triangle fan
(numVertices * 2); // AA fringes
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point.
const uint32_t numDrawIndices = numTris * 3;
resetGeometry(stroker);
#if CREATE_VERTEX_BUFFER
// Vertex buffer
{
expandVB(stroker, numDrawVertices);
const __m128 vtxLast = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + (lastVertexID << 1)));
__m128 d01 = _mm_vec2_dir(vtxLast, vtx0);
__m128 p1 = vtx0;
const float* srcPos = vertexList + 2;
float* dstPos = &stroker->m_PosBuffer->x;
const uint32_t numIter = lastVertexID >> 1;
for (uint32_t iSegment = 0; iSegment < numIter; ++iSegment) {
// srcPos alignment unknown
const __m128 p23 = _mm_loadu_ps(srcPos); // { p2.x, p2.y, p3.x, p3.y }
const __m128 p2 = p23; // { p2.x, p2.y, DC, DC }
const __m128 p3 = _mm_movehl_ps(p23, p23); // { p3.x, p3.y, DC, DC }
const __m128 d12 = _mm_vec2_dir(p1, p2); // { d12.x, d12.y, DC, DC }
const __m128 d23 = _mm_vec2_dir(p2, p3); // { d23.x, d23.y, DC, DC }
const __m128 v012 = calcExtrusionVector(d01, d12); // { v012.x, v012.y, DC, DC }
const __m128 v123 = calcExtrusionVector(d12, d23); // { v123.x, v123.y, DC, DC }
const __m128 v012_123 = _mm_movelh_ps(v012, v123); // { v012.x, v012.y, v123.x, v123.y }
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa); // { v012.x * aa, v012.y * aa, v123.x * aa, v123.y * aa }
const __m128 p12 = _mm_movelh_ps(p1, p2); // { p1.x, p1.y, p2.x, p2.y }
const __m128 posEdge = _mm_add_ps(p12, v012_v123_aa); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p2.x + v123.x * aa, p2.y + v123.y * aa }
const __m128 negEdge = _mm_sub_ps(p12, v012_v123_aa); // { p1.x - v012.x * aa, p1.y - v012.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa }
const __m128 packed0 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(1, 0, 1, 0)); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p1.x - v012.x * aa, p1.y - v012.y * aa }
const __m128 packed1 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(3, 2, 3, 2)); // { p2.x + v123.x * aa, p2.y + v123.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa }
// Aligned stores because dstPos is 16-byte aligned
_mm_store_ps(dstPos, packed0);
_mm_store_ps(dstPos + 4, packed1);
dstPos += 8;
srcPos += 4;
d01 = d23;
p1 = p3;
}
const uint32_t rem = (lastVertexID & 1);
if (rem) {
const __m128 p2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)srcPos);
const __m128 d12 = _mm_vec2_dir(p1, p2);
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, d12), xmm_aa);
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa));
_mm_store_ps(dstPos, packed);
dstPos += 4;
srcPos += 2;
d01 = d12;
p1 = p2;
}
// Last segment
{
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, _mm_vec2_dir(p1, vtx0)), xmm_aa);
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa));
_mm_storeu_ps(dstPos, packed);
}
stroker->m_NumVertices += numDrawVertices;
}
#endif // CREATE_VERTEX_BUFFER
#if CREATE_INDEX_BUFFER
// Index buffer
{
expandIB(stroker, numDrawIndices);
uint16_t* dstIndex = stroker->m_IndexBuffer;
// First fringe quad
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3;
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2;
dstIndex += 6;
const uint32_t numFanTris = numVertices - 2;
uint16_t secondTriVertex = 2;
for (uint32_t i = 0; i < numFanTris; ++i) {
const uint16_t id0 = secondTriVertex;
const uint16_t id1 = secondTriVertex + 1;
const uint16_t id2 = secondTriVertex + 2;
const uint16_t id3 = secondTriVertex + 3;
// Fan triangle
dstIndex[0] = 0;
dstIndex[1] = id0;
dstIndex[2] = id2;
// Fringe quad
dstIndex[3] = id0;
dstIndex[4] = id1;
dstIndex[5] = id3;
dstIndex[6] = id0;
dstIndex[7] = id3;
dstIndex[8] = id2;
dstIndex += 9;
secondTriVertex += 2;
}
// Last fringe quad
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1);
dstIndex[0] = lastID;
dstIndex[1] = lastID + 1;
dstIndex[2] = 1;
dstIndex[3] = lastID;
dstIndex[4] = 1;
dstIndex[5] = 0;
stroker->m_NumIndices += numDrawIndices;
}
#endif
}
static inline __m128 xmm_rsqrt(__m128 a)
{
#if RSQRT_ALGORITHM == 0
const __m128 res = _mm_div_ps(xmm_one, _mm_sqrt_ps(a));
#elif RSQRT_ALGORITHM == 1
const __m128 res = _mm_rsqrt_ps(a);
#elif RSQRT_ALGORITHM == 2
// Newton/Raphson
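// x' = 0.5 * x * (3 - a * x * x), one refinement step applied to all 4 lanes of the estimate.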
const __m128 rsqrtEst = _mm_rsqrt_ps(a);
const __m128 iter0 = _mm_mul_ps(a, rsqrtEst);
const __m128 iter1 = _mm_mul_ps(iter0, rsqrtEst);
const __m128 half_rsqrt = _mm_mul_ps(xmm_half, rsqrtEst);
const __m128 three_sub_iter1 = _mm_sub_ps(xmm_three, iter1);
const __m128 res = _mm_mul_ps(half_rsqrt, three_sub_iter1);
#endif
return res;
}
static inline __m128 xmm_rcp(__m128 a)
{
#if RCP_ALGORITHM == 0
const __m128 inv_a = _mm_div_ps(xmm_one, a);
#elif RCP_ALGORITHM == 1
const __m128 inv_a = _mm_rcp_ps(a);
#elif RCP_ALGORITHM == 2
// Newton/Raphson (packed version of rcp() above): est' = est * (2 - a * est),
// written as est + (oneish - a * est) * est.
const __m128 est = _mm_rcp_ps(a);
const __m128 residual = _mm_sub_ps(oneish, _mm_mul_ps(a, est));
const __m128 inv_a = _mm_add_ps(_mm_mul_ps(residual, est), est);
#endif
return inv_a;
}
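// SSE variant 2: same 2-points-per-iteration layout as strokerConvexFillAA_SIMD, but the
// segment normalization and the extrusion-vector selection are done branch-free with compare
// masks (cmpge / and / andnot / or) instead of per-segment branches.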
static void strokerConvexFillAA_SIMD2(Stroker* stroker, const float* vertexList, uint32_t numVertices)
{
const uint32_t lastVertexID = numVertices - 1;
const __m128 vtx0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)vertexList);
const __m128 vtx1 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 2));
const __m128 vtx2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 4));
const float cross = _mm_vec2_cross(_mm_sub_ps(vtx1, vtx0), _mm_sub_ps(vtx2, vtx0));
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross);
const __m128 xmm_aa = _mm_set_ps1(aa);
const uint32_t numTris =
(numVertices - 2) + // Triangle fan
(numVertices * 2); // AA fringes
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point.
const uint32_t numDrawIndices = numTris * 3;
resetGeometry(stroker);
#if CREATE_VERTEX_BUFFER
// Vertex buffer
{
expandVB(stroker, numDrawVertices);
const __m128 vtxLast = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + (lastVertexID << 1)));
__m128 d01 = _mm_vec2_dir(vtxLast, vtx0);
__m128 p1 = vtx0;
const float* srcPos = vertexList + 2;
float* dstPos = &stroker->m_PosBuffer->x;
const __m128 xmm_epsilon = _mm_set_ps1(VG_EPSILON);
const __m128 vec2x2_perpCCW_xorMask = _mm_castsi128_ps(_mm_set_epi32(0, 0x80000000, 0, 0x80000000));
const uint32_t numIter = lastVertexID >> 1;
for (uint32_t i = 0; i < numIter; ++i) {
#if TEST_SIMD == 2 && INSERT_IACA_MARKERS
IACA_VC64_START;
#endif
// Even if the initial srcPos is 16-byte aligned, we just skipped 8 bytes, so it's not aligned anymore.
const __m128 p23 = _mm_loadu_ps(srcPos); // { p2.x, p2.y, p3.x, p3.y }
const __m128 p12 = _mm_movelh_ps(p1, p23); // { p1.x, p1.y, p2.x, p2.y }
__m128 d12, d23;
{
const __m128 d12_23 = _mm_sub_ps(p23, p12); // { p2.x - p1.x, p2.y - p1.y, p3.x - p2.x, p3.y - p2.y }
const __m128 d12_23_xy_sqr = _mm_mul_ps(d12_23, d12_23); // { d12.x * d12.x, d12.y * d12.y, d23.x * d23.x, d23.y * d23.y }
const __m128 d12_23_yx_sqr = _mm_shuffle_ps(d12_23_xy_sqr, d12_23_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1)); // { d12.y * d12.y, d12.x * d12.x, d23.y * d23.y, d23.x * d23.x }
const __m128 len12_23_sqr = _mm_add_ps(d12_23_xy_sqr, d12_23_yx_sqr); // { len12_sqr, len12_sqr, len23_sqr, len23_sqr }
const __m128 lenSqr_ge_eps = _mm_cmpge_ps(len12_23_sqr, xmm_epsilon); // { len12_sqr >= eps ? 0xFFFFFFFF : 0, ... }
const __m128 invLen12_23 = xmm_rsqrt(len12_23_sqr);
const __m128 invLen12_23_masked = _mm_and_ps(invLen12_23, lenSqr_ge_eps); // { len12_sqr >= eps ? rsqrt(len12_sqr) : 0, ... }
const __m128 d12_23_norm = _mm_mul_ps(d12_23, invLen12_23_masked); //
d12 = _mm_movelh_ps(d12_23_norm, d12_23_norm);
d23 = _mm_movehl_ps(d12_23_norm, d12_23_norm);
}
__m128 v012_123;
{
const __m128 d12xy_d01xy = _mm_movelh_ps(d12, d01); // { d12.x, d12.y, d01.x, d01.y }
const __m128 d23xy_d12xy = _mm_movelh_ps(d23, d12); // { d23.x, d23.y, d12.x, d12.y }
const __m128 d01yx_d12yx = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(0, 1, 2, 3)); // { d01.y, d01.x, d12.y, d12.x }
const __m128 d12yx_d23yx = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(0, 1, 2, 3)); // { d12.y, d12.x, d23.y, d23.x }
const __m128 d12xd01y_d12yd01x = _mm_mul_ps(d12xy_d01xy, d01yx_d12yx); // { d12.x * d01.y, d12.y * d01.x, d01.x * d12.y, d01.y * d12.x }
const __m128 d23xd12y_d23yd12x = _mm_mul_ps(d23xy_d12xy, d12yx_d23yx); // { d23.x * d12.y, d23.y * d12.x, d12.x * d23.y, d12.y * d23.x }
const __m128 d12yd01x_d23yd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(1, 1, 1, 1)); // { d12.y * d01.x, d12.y * d01.x, d23.y * d12.x, d23.y * d12.x }
const __m128 d12xd01y_d23xd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(0, 0, 0, 0)); // { d12.x * d01.y, d12.x * d01.y, d23.x * d12.y, d23.x * d12.y }
const __m128 cross012_123 = _mm_sub_ps(d12xd01y_d23xd12x, d12yd01x_d23yd12x); // { cross(d12, d01), cross(d12, d01), cross(d23, d12), cross(d23, d12) }
const __m128 inv_cross012_123 = xmm_rcp(cross012_123);
const __m128 v012_123_fake = _mm_xor_ps(d01yx_d12yx, vec2x2_perpCCW_xorMask);
const __m128 d01xy_d12xy = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(1, 0, 3, 2));
const __m128 d12xy_d23xy = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(1, 0, 3, 2));
const __m128 d012xy_d123xy = _mm_sub_ps(d01xy_d12xy, d12xy_d23xy);
const __m128 v012_123_true = _mm_mul_ps(d012xy_d123xy, inv_cross012_123);
const __m128 cross_gt_eps = _mm_cmpge_ps(cross012_123, xmm_epsilon);
const __m128 v012_123_true_masked = _mm_and_ps(cross_gt_eps, v012_123_true);
const __m128 v012_123_fake_masked = _mm_andnot_ps(cross_gt_eps, v012_123_fake);
v012_123 = _mm_or_ps(v012_123_true_masked, v012_123_fake_masked);
}
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa); // { v012.x * aa, v012.y * aa, v123.x * aa, v123.y * aa }
const __m128 posEdge = _mm_add_ps(p12, v012_v123_aa); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p2.x + v123.x * aa, p2.y + v123.y * aa }
const __m128 negEdge = _mm_sub_ps(p12, v012_v123_aa); // { p1.x - v012.x * aa, p1.y - v012.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa }
const __m128 packed0 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(1, 0, 1, 0)); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p1.x - v012.x * aa, p1.y - v012.y * aa }
const __m128 packed1 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(3, 2, 3, 2)); // { p2.x + v123.x * aa, p2.y + v123.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa }
// Aligned stores because dstPos is 16-byte aligned
_mm_store_ps(dstPos, packed0);
_mm_store_ps(dstPos + 4, packed1);
dstPos += 8;
srcPos += 4;
d01 = d23;
p1 = _mm_movehl_ps(p23, p23);
}
#if TEST_SIMD == 2 && INSERT_IACA_MARKERS
IACA_VC64_END;
#endif
const uint32_t rem = (lastVertexID & 1);
if (rem) {
const __m128 p2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)srcPos);
const __m128 d12 = _mm_vec2_dir(p1, p2);
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, d12), xmm_aa);
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa));
_mm_store_ps(dstPos, packed);
dstPos += 4;
srcPos += 2;
d01 = d12;
p1 = p2;
}
// Last segment
{
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, _mm_vec2_dir(p1, vtx0)), xmm_aa);
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa));
_mm_storeu_ps(dstPos, packed);
}
stroker->m_NumVertices += numDrawVertices;
}
#endif // CREATE_VERTEX_BUFFER
#if CREATE_INDEX_BUFFER
// Index buffer
{
expandIB(stroker, numDrawIndices);
uint16_t* dstIndex = stroker->m_IndexBuffer;
// First fringe quad
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3;
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2;
dstIndex += 6;
const uint32_t numFanTris = numVertices - 2;
uint16_t secondTriVertex = 2;
for (uint32_t i = 0; i < numFanTris; ++i) {
const uint16_t id0 = secondTriVertex;
const uint16_t id1 = secondTriVertex + 1;
const uint16_t id2 = secondTriVertex + 2;
const uint16_t id3 = secondTriVertex + 3;
// Fan triangle
dstIndex[0] = 0;
dstIndex[1] = id0;
dstIndex[2] = id2;
// Fringe quad
dstIndex[3] = id0;
dstIndex[4] = id1;
dstIndex[5] = id3;
dstIndex[6] = id0;
dstIndex[7] = id3;
dstIndex[8] = id2;
dstIndex += 9;
secondTriVertex += 2;
}
// Last fringe quad
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1);
dstIndex[0] = lastID;
dstIndex[1] = lastID + 1;
dstIndex[2] = 1;
dstIndex[3] = lastID;
dstIndex[4] = 1;
dstIndex[5] = 0;
stroker->m_NumIndices += numDrawIndices;
}
#endif
}
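// SSE variant 3: processes 4 polygon points (4 new segments) per loop iteration with the same
// branch-free masking as SIMD2, and (when SIMD_INDEX_BUFFER is set) emits the triangle-fan /
// fringe indices 4 fan triangles (36 indices) at a time from precomputed delta tables.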
static void strokerConvexFillAA_SIMD3(Stroker* stroker, const float* vertexList, uint32_t numVertices)
{
const uint32_t lastVertexID = numVertices - 1;
const __m128 vtx0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)vertexList);
const __m128 vtx1 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 2));
const __m128 vtx2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 4));
const float cross = _mm_vec2_cross(_mm_sub_ps(vtx1, vtx0), _mm_sub_ps(vtx2, vtx0));
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross);
const __m128 xmm_aa = _mm_set_ps1(aa);
const uint32_t numTris =
(numVertices - 2) + // Triangle fan
(numVertices * 2); // AA fringes
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point.
const uint32_t numDrawIndices = numTris * 3;
resetGeometry(stroker);
#if CREATE_VERTEX_BUFFER
// Vertex buffer
{
expandVB(stroker, numDrawVertices);
const __m128 vtxLast = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + (lastVertexID << 1)));
__m128 d01 = _mm_vec2_dir(vtxLast, vtx0);
__m128 p1 = vtx0;
const float* srcPos = vertexList + 2;
float* dstPos = &stroker->m_PosBuffer->x;
const __m128 xmm_epsilon = _mm_set_ps1(VG_EPSILON);
const __m128 vec2x2_perpCCW_xorMask = _mm_castsi128_ps(_mm_set_epi32(0, 0x80000000, 0, 0x80000000));
const uint32_t numIter = lastVertexID >> 2;
for (uint32_t i = 0; i < numIter; ++i) {
// Load 4 points; together with p1 from the previous iteration they form 4 segments.
const __m128 p23 = _mm_loadu_ps(srcPos); // { p2.x, p2.y, p3.x, p3.y }
const __m128 p45 = _mm_loadu_ps(srcPos + 4); // { p4.x, p4.y, p5.x, p5.y }
const __m128 p12 = _mm_movelh_ps(p1, p23); // { p1.x, p1.y, p2.x, p2.y }
const __m128 p34 = _mm_movelh_ps(_mm_movehl_ps(p23, p23), p45); // { p3.x, p3.y, p4.x, p4.y }
// Calculate the direction vector of the 4 segments
// NOTE: Tried to calc all 4 rsqrt in 1 call but it ends up being slower. Kept this version for now.
const __m128 d12_23_unorm = _mm_sub_ps(p23, p12);
const __m128 d34_45_unorm = _mm_sub_ps(p45, p34);
const __m128 d12_23_xy_sqr = _mm_mul_ps(d12_23_unorm, d12_23_unorm);
const __m128 d34_45_xy_sqr = _mm_mul_ps(d34_45_unorm, d34_45_unorm);
const __m128 d12_23_yx_sqr = _mm_shuffle_ps(d12_23_xy_sqr, d12_23_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1));
const __m128 d34_45_yx_sqr = _mm_shuffle_ps(d34_45_xy_sqr, d34_45_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1));
const __m128 len12_23_sqr = _mm_add_ps(d12_23_xy_sqr, d12_23_yx_sqr);
const __m128 len34_45_sqr = _mm_add_ps(d34_45_xy_sqr, d34_45_yx_sqr);
const __m128 lenSqr123_ge_eps = _mm_cmpge_ps(len12_23_sqr, xmm_epsilon);
const __m128 lenSqr345_ge_eps = _mm_cmpge_ps(len34_45_sqr, xmm_epsilon);
const __m128 invLen12_23 = xmm_rsqrt(len12_23_sqr);
const __m128 invLen34_45 = xmm_rsqrt(len34_45_sqr);
const __m128 invLen12_23_masked = _mm_and_ps(invLen12_23, lenSqr123_ge_eps);
const __m128 invLen34_45_masked = _mm_and_ps(invLen34_45, lenSqr345_ge_eps);
const __m128 d12_23 = _mm_mul_ps(d12_23_unorm, invLen12_23_masked);
const __m128 d34_45 = _mm_mul_ps(d34_45_unorm, invLen34_45_masked);
// Calculate the 4 extrusion vectors for the 4 points based on the equation:
// abs(cross(d12, d01)) > epsilon ? ((d01 - d12) / cross(d12, d01)) : rot90CCW(d01)
const __m128 v012_123_fake = _mm_xor_ps(_mm_shuffle_ps(d01, d12_23, _MM_SHUFFLE(0, 1, 0, 1)), vec2x2_perpCCW_xorMask);
const __m128 v234_345_fake = _mm_xor_ps(_mm_shuffle_ps(d12_23, d34_45, _MM_SHUFFLE(0, 1, 2, 3)), vec2x2_perpCCW_xorMask);
// cross012 = d12.x * d01.y - d12.y * d01.x
// cross123 = d23.x * d12.y - d23.y * d12.x
// cross234 = d34.x * d23.y - d34.y * d23.x
// cross345 = d45.x * d34.y - d45.y * d34.x
const __m128 dxy01_12 = _mm_shuffle_ps(d01, d12_23, _MM_SHUFFLE(1, 0, 1, 0));
const __m128 dxy12_23 = d12_23;
const __m128 dxy23_34 = _mm_shuffle_ps(d12_23, d34_45, _MM_SHUFFLE(1, 0, 3, 2));
const __m128 dxy34_45 = d34_45;
const __m128 dx01_12_23_34 = _mm_shuffle_ps(dxy01_12, dxy23_34, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 dy01_12_23_34 = _mm_shuffle_ps(dxy01_12, dxy23_34, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 dx12_23_34_45 = _mm_shuffle_ps(dxy12_23, dxy34_45, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 dy12_23_34_45 = _mm_shuffle_ps(dxy12_23, dxy34_45, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 crossx012_123_234_345 = _mm_mul_ps(dx12_23_34_45, dy01_12_23_34);
const __m128 crossy012_123_234_345 = _mm_mul_ps(dy12_23_34_45, dx01_12_23_34);
const __m128 cross012_123_234_345 = _mm_sub_ps(crossx012_123_234_345, crossy012_123_234_345);
const __m128 inv_cross012_123_234_345 = xmm_rcp(cross012_123_234_345);
const __m128 cross_gt_eps012_123_234_345 = _mm_cmpge_ps(cross012_123_234_345, xmm_epsilon);
const __m128 inv_cross012_123 = _mm_shuffle_ps(inv_cross012_123_234_345, inv_cross012_123_234_345, _MM_SHUFFLE(1, 1, 0, 0));
const __m128 inv_cross234_345 = _mm_shuffle_ps(inv_cross012_123_234_345, inv_cross012_123_234_345, _MM_SHUFFLE(3, 3, 2, 2));
const __m128 cross012_123_gt_eps = _mm_shuffle_ps(cross_gt_eps012_123_234_345, cross_gt_eps012_123_234_345, _MM_SHUFFLE(1, 1, 0, 0));
const __m128 cross234_345_gt_eps = _mm_shuffle_ps(cross_gt_eps012_123_234_345, cross_gt_eps012_123_234_345, _MM_SHUFFLE(3, 3, 2, 2));
const __m128 dxy012_123 = _mm_sub_ps(dxy01_12, dxy12_23);
const __m128 dxy234_345 = _mm_sub_ps(dxy23_34, dxy34_45);
const __m128 v012_123_true = _mm_mul_ps(dxy012_123, inv_cross012_123);
const __m128 v234_345_true = _mm_mul_ps(dxy234_345, inv_cross234_345);
const __m128 v012_123_true_masked = _mm_and_ps(cross012_123_gt_eps, v012_123_true);
const __m128 v234_345_true_masked = _mm_and_ps(cross234_345_gt_eps, v234_345_true);
const __m128 v012_123_fake_masked = _mm_andnot_ps(cross012_123_gt_eps, v012_123_fake);
const __m128 v234_345_fake_masked = _mm_andnot_ps(cross234_345_gt_eps, v234_345_fake);
const __m128 v012_123 = _mm_or_ps(v012_123_true_masked, v012_123_fake_masked);
const __m128 v234_345 = _mm_or_ps(v234_345_true_masked, v234_345_fake_masked);
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa);
const __m128 v234_v345_aa = _mm_mul_ps(v234_345, xmm_aa);
// Calculate the 2 fringe points for each of p1, p2, p3 and p4
const __m128 posEdge12 = _mm_add_ps(p12, v012_v123_aa);
const __m128 negEdge12 = _mm_sub_ps(p12, v012_v123_aa);
const __m128 posEdge34 = _mm_add_ps(p34, v234_v345_aa);
const __m128 negEdge34 = _mm_sub_ps(p34, v234_v345_aa);
const __m128 p1_in_out = _mm_shuffle_ps(posEdge12, negEdge12, _MM_SHUFFLE(1, 0, 1, 0));
const __m128 p2_in_out = _mm_shuffle_ps(posEdge12, negEdge12, _MM_SHUFFLE(3, 2, 3, 2));
const __m128 p3_in_out = _mm_shuffle_ps(posEdge34, negEdge34, _MM_SHUFFLE(1, 0, 1, 0));
const __m128 p4_in_out = _mm_shuffle_ps(posEdge34, negEdge34, _MM_SHUFFLE(3, 2, 3, 2));
// Store the fringe points
_mm_store_ps(dstPos + 0, p1_in_out);
_mm_store_ps(dstPos + 4, p2_in_out);
_mm_store_ps(dstPos + 8, p3_in_out);
_mm_store_ps(dstPos + 12, p4_in_out);
// Move on to the next iteration.
d01 = _mm_movehl_ps(d34_45, d34_45);
p1 = _mm_movehl_ps(p45, p45); // p1 = p5
srcPos += 8;
dstPos += 16;
}
uint32_t rem = (lastVertexID & 3);
if (rem >= 2) {
const __m128 p23 = _mm_loadu_ps(srcPos);
const __m128 p12 = _mm_movelh_ps(p1, p23);
const __m128 d12_23 = _mm_sub_ps(p23, p12);
const __m128 d12_23_xy_sqr = _mm_mul_ps(d12_23, d12_23);
const __m128 d12_23_yx_sqr = _mm_shuffle_ps(d12_23_xy_sqr, d12_23_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1));
const __m128 len12_23_sqr = _mm_add_ps(d12_23_xy_sqr, d12_23_yx_sqr);
const __m128 lenSqr_ge_eps = _mm_cmpge_ps(len12_23_sqr, xmm_epsilon);
const __m128 invLen12_23 = xmm_rsqrt(len12_23_sqr);
const __m128 invLen12_23_masked = _mm_and_ps(invLen12_23, lenSqr_ge_eps);
const __m128 d12_23_norm = _mm_mul_ps(d12_23, invLen12_23_masked);
const __m128 d12 = _mm_movelh_ps(d12_23_norm, d12_23_norm);
const __m128 d23 = _mm_movehl_ps(d12_23_norm, d12_23_norm);
const __m128 d12xy_d01xy = _mm_movelh_ps(d12, d01);
const __m128 d23xy_d12xy = _mm_movelh_ps(d23, d12);
const __m128 d01yx_d12yx = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(0, 1, 2, 3));
const __m128 d12yx_d23yx = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(0, 1, 2, 3));
const __m128 d12xd01y_d12yd01x = _mm_mul_ps(d12xy_d01xy, d01yx_d12yx);
const __m128 d23xd12y_d23yd12x = _mm_mul_ps(d23xy_d12xy, d12yx_d23yx);
const __m128 d12yd01x_d23yd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(1, 1, 1, 1));
const __m128 d12xd01y_d23xd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(0, 0, 0, 0));
const __m128 cross012_123 = _mm_sub_ps(d12xd01y_d23xd12x, d12yd01x_d23yd12x);
const __m128 inv_cross012_123 = xmm_rcp(cross012_123);
const __m128 v012_123_fake = _mm_xor_ps(d01yx_d12yx, vec2x2_perpCCW_xorMask);
const __m128 d01xy_d12xy = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(1, 0, 3, 2));
const __m128 d12xy_d23xy = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(1, 0, 3, 2));
const __m128 d012xy_d123xy = _mm_sub_ps(d01xy_d12xy, d12xy_d23xy);
const __m128 v012_123_true = _mm_mul_ps(d012xy_d123xy, inv_cross012_123);
const __m128 cross_gt_eps = _mm_cmpge_ps(cross012_123, xmm_epsilon);
const __m128 v012_123_true_masked = _mm_and_ps(cross_gt_eps, v012_123_true);
const __m128 v012_123_fake_masked = _mm_andnot_ps(cross_gt_eps, v012_123_fake);
const __m128 v012_123 = _mm_or_ps(v012_123_true_masked, v012_123_fake_masked);
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa);
const __m128 posEdge = _mm_add_ps(p12, v012_v123_aa);
const __m128 negEdge = _mm_sub_ps(p12, v012_v123_aa);
const __m128 packed0 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(1, 0, 1, 0));
const __m128 packed1 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(3, 2, 3, 2));
_mm_store_ps(dstPos, packed0);
_mm_store_ps(dstPos + 4, packed1);
dstPos += 8;
srcPos += 4;
d01 = d23;
p1 = _mm_movehl_ps(p23, p23);
rem -= 2;
}
if (rem) {
const __m128 p2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)srcPos);
const __m128 d12 = _mm_vec2_dir(p1, p2);
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, d12), xmm_aa);
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa));
_mm_store_ps(dstPos, packed);
dstPos += 4;
srcPos += 2;
d01 = d12;
p1 = p2;
}
// Last segment
{
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, _mm_vec2_dir(p1, vtx0)), xmm_aa);
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa));
_mm_storeu_ps(dstPos, packed);
}
stroker->m_NumVertices += numDrawVertices;
}
#endif // CREATE_VERTEX_BUFFER
#if CREATE_INDEX_BUFFER
// Index buffer
{
expandIB(stroker, numDrawIndices);
uint16_t* dstIndex = stroker->m_IndexBuffer;
// First fringe quad
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3;
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2;
dstIndex += 6;
const uint32_t numFanTris = numVertices - 2;
#if !SIMD_INDEX_BUFFER
uint16_t secondTriVertex = 2;
for (uint32_t i = 0; i < numFanTris; ++i) {
const uint16_t id0 = secondTriVertex;
const uint16_t id1 = secondTriVertex + 1;
const uint16_t id2 = secondTriVertex + 2;
const uint16_t id3 = secondTriVertex + 3;
// Fan triangle
dstIndex[0] = 0;
dstIndex[1] = id0;
dstIndex[2] = id2;
// Fringe quad
dstIndex[3] = id0;
dstIndex[4] = id1;
dstIndex[5] = id3;
dstIndex[6] = id0;
dstIndex[7] = id3;
dstIndex[8] = id2;
dstIndex += 9;
secondTriVertex += 2;
}
#else
#if 0
static const uint16_t delta0[8] = { 0, 2, 0, 1, 3, 0, 3, 2 };
static const uint16_t delta1[8] = { 2, 4, 2, 3, 5, 2, 5, 4 };
static const uint16_t delta2[8] = { 4, 6, 4, 5, 7, 4, 7, 6 };
static const uint16_t delta3[8] = { 6, 8, 6, 7, 9, 6, 9, 8 };
const __m128i xmm_delta0 = _mm_loadu_si128((const __m128i*)delta0);
const __m128i xmm_delta1 = _mm_loadu_si128((const __m128i*)delta1);
const __m128i xmm_delta2 = _mm_loadu_si128((const __m128i*)delta2);
const __m128i xmm_delta3 = _mm_loadu_si128((const __m128i*)delta3);
const __m128i xmm_stv_delta = _mm_set1_epi16(8);
__m128i xmm_stv = _mm_set1_epi16(2);
const uint32_t numIter = numFanTris >> 2;
for (uint32_t i = 0; i < numIter; ++i) {
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS
IACA_VC64_START;
#endif
// { 0, stv + 0, stv + 2, stv + 0, stv + 1, stv + 3, stv + 0, stv + 3 }
// { stv + 2, 0, stv + 2, stv + 4, stv + 2, stv + 3, stv + 5, stv + 2 }
// { stv + 5, stv + 4, 0, stv + 4, stv + 6, stv + 4, stv + 5, stv + 7 }
// { stv + 4, stv + 7, stv + 6, 0, stv + 6, stv + 8, stv + 6, stv + 7 }
// { stv + 9, stv + 6, stv + 9, stv + 8 }
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0); // { stv + 0, stv + 2, stv + 0, stv + 1, stv + 3, stv + 0, stv + 3, stv + 2 }
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1); // { stv + 2, stv + 4, stv + 2, stv + 3, stv + 5, stv + 2, stv + 5, stv + 4 }
const __m128i xmm_id2 = _mm_add_epi16(xmm_stv, xmm_delta2); // { stv + 4, stv + 6, stv + 4, stv + 5, stv + 7, stv + 4, stv + 7, stv + 6 }
const __m128i xmm_id3 = _mm_add_epi16(xmm_stv, xmm_delta3); // { stv + 6, stv + 8, stv + 6, stv + 7, stv + 9, stv + 6, stv + 9, stv + 8 }
dstIndex[0] = 0;
dstIndex[9] = 0;
dstIndex[18] = 0;
dstIndex[27] = 0;
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0);
_mm_storeu_si128((__m128i*)(dstIndex + 10), xmm_id1);
_mm_storeu_si128((__m128i*)(dstIndex + 19), xmm_id2);
_mm_storeu_si128((__m128i*)(dstIndex + 28), xmm_id3);
dstIndex += 36;
xmm_stv = _mm_add_epi16(xmm_stv, xmm_stv_delta);
}
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS
IACA_VC64_END;
#endif
uint32_t rem = numFanTris & 3;
if (rem >= 2) {
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0);
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1);
dstIndex[0] = 0;
dstIndex[9] = 0;
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0);
_mm_storeu_si128((__m128i*)(dstIndex + 10), xmm_id1);
dstIndex += 18;
xmm_stv = _mm_add_epi16(xmm_stv, _mm_set1_epi16(4));
rem -= 2;
}
if (rem) {
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0);
dstIndex[0] = 0;
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0);
dstIndex += 9;
}
#else
__m128i xmm_stv = _mm_set1_epi16(2);
{
static const uint16_t delta0[8] = { 0, 0, 2, 0, 1, 3, 0, 3 };
static const uint16_t delta1[8] = { 2, 0, 2, 4, 2, 3, 5, 2 };
static const uint16_t delta2[8] = { 5, 4, 0, 4, 6, 4, 5, 7 };
static const uint16_t delta3[8] = { 4, 7, 6, 0, 6, 8, 6, 7 };
static const uint16_t delta4[8] = { 9, 6, 9, 8, 0, 0, 0, 0 };
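// Each fan triangle plus its fringe quad is 9 indices: { 0, stv, stv + 2 } followed by
// { stv, stv + 1, stv + 3, stv, stv + 3, stv + 2 }. The delta tables above hold 4 such groups
// (36 uint16_t) as offsets from stv, written as four 16-byte stores plus one 8-byte store; the
// "0" fan-center entries are patched in with _mm_insert_epi16 below.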
const __m128i xmm_delta0 = _mm_loadu_si128((const __m128i*)delta0);
const __m128i xmm_delta1 = _mm_loadu_si128((const __m128i*)delta1);
const __m128i xmm_delta2 = _mm_loadu_si128((const __m128i*)delta2);
const __m128i xmm_delta3 = _mm_loadu_si128((const __m128i*)delta3);
const __m128i xmm_delta4 = _mm_loadu_si128((const __m128i*)delta4);
const __m128i xmm_stv_delta = _mm_set1_epi16(8);
const uint32_t numIter = numFanTris >> 2;
for (uint32_t i = 0; i < numIter; ++i) {
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS
IACA_VC64_START;
#endif
// { 0, stv + 0, stv + 2, stv + 0, stv + 1, stv + 3, stv + 0, stv + 3 }
// { stv + 2, 0, stv + 2, stv + 4, stv + 2, stv + 3, stv + 5, stv + 2 }
// { stv + 5, stv + 4, 0, stv + 4, stv + 6, stv + 4, stv + 5, stv + 7 }
// { stv + 4, stv + 7, stv + 6, 0, stv + 6, stv + 8, stv + 6, stv + 7 }
// { stv + 9, stv + 6, stv + 9, stv + 8 }
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0);
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1);
const __m128i xmm_id2 = _mm_add_epi16(xmm_stv, xmm_delta2);
const __m128i xmm_id3 = _mm_add_epi16(xmm_stv, xmm_delta3);
const __m128i xmm_id4 = _mm_add_epi16(xmm_stv, xmm_delta4);
_mm_storeu_si128((__m128i*)(dstIndex + 0), _mm_insert_epi16(xmm_id0, 0, 0));
_mm_storeu_si128((__m128i*)(dstIndex + 8), _mm_insert_epi16(xmm_id1, 0, 1));
_mm_storeu_si128((__m128i*)(dstIndex + 16), _mm_insert_epi16(xmm_id2, 0, 2));
_mm_storeu_si128((__m128i*)(dstIndex + 24), _mm_insert_epi16(xmm_id3, 0, 3));
_mm_storel_epi64((__m128i*)(dstIndex + 32), xmm_id4);
dstIndex += 36;
xmm_stv = _mm_add_epi16(xmm_stv, xmm_stv_delta);
}
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS
IACA_VC64_END;
#endif
}
{
static const uint16_t delta0[8] = { 0, 2, 0, 1, 3, 0, 3, 2 };
static const uint16_t delta1[8] = { 2, 4, 2, 3, 5, 2, 5, 4 };
const __m128i xmm_delta0 = _mm_loadu_si128((const __m128i*)delta0);
const __m128i xmm_delta1 = _mm_loadu_si128((const __m128i*)delta1);
uint32_t rem = numFanTris & 3;
if (rem >= 2) {
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0);
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1);
dstIndex[0] = 0;
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0);
dstIndex[9] = 0;
_mm_storeu_si128((__m128i*)(dstIndex + 10), xmm_id1);
dstIndex += 18;
xmm_stv = _mm_add_epi16(xmm_stv, _mm_set1_epi16(4));
rem -= 2;
}
if (rem) {
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0);
dstIndex[0] = 0;
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0);
dstIndex += 9;
}
}
#endif // 0
#endif // SIMD_INDEX_BUFFER
// Last fringe quad
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1);
dstIndex[0] = lastID;
dstIndex[1] = lastID + 1;
dstIndex[2] = 1;
dstIndex[3] = lastID;
dstIndex[4] = 1;
dstIndex[5] = 0;
stroker->m_NumIndices += numDrawIndices;
}
#endif
}
static void generatePath(float* vtx, uint32_t numVertices)
{
// Circle
const float cx = 0.0f;
const float cy = 0.0f;
const float r = 100.0f;
vtx[0] = cx + r;
vtx[1] = cy;
vtx += 2;
const float dtheta = -(2.0f * PI) / (float)numVertices;
const float cos_dtheta = cosf(dtheta);
const float sin_dtheta = sinf(dtheta);
float ca = 1.0f;
float sa = 0.0f;
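// Rotate (ca, sa) by dtheta each step using the angle-addition identities instead of calling
// cosf/sinf per vertex.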
for (uint32_t i = 1; i < numVertices; ++i) {
const float nextSin = sin_dtheta * ca + cos_dtheta * sa;
const float nextCos = cos_dtheta * ca - sin_dtheta * sa;
ca = nextCos;
sa = nextSin;
vtx[0] = cx + r * ca;
vtx[1] = cy + r * sa;
vtx += 2;
}
}
static inline int64_t getHPCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
int64_t i64 = li.QuadPart;
return i64;
}
static inline int64_t getHPFrequency()
{
LARGE_INTEGER li;
QueryPerformanceFrequency(&li);
return li.QuadPart;
}
int main()
{
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
SetThreadAffinityMask(GetCurrentThread(), 0x00000001);
float* vertexList = (float*)_aligned_malloc(sizeof(float) * 2 * NUM_VERTICES, 16);
generatePath(vertexList, NUM_VERTICES);
Stroker stroker;
memset(&stroker, 0, sizeof(Stroker));
stroker.m_FringeWidth = 1.0f;
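// Warm-up call outside the timed loop; it also grows the stroker buffers to their final size.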
#if TEST_SIMD == 1
strokerConvexFillAA_SIMD(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 2
strokerConvexFillAA_SIMD2(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 3
strokerConvexFillAA_SIMD3(&stroker, vertexList, NUM_VERTICES);
#else
strokerConvexFillAA(&stroker, vertexList, NUM_VERTICES);
#endif
int64_t start = getHPCounter();
for (uint32_t i = 0; i < NUM_ITERATIONS; ++i) {
#if TEST_SIMD == 1
strokerConvexFillAA_SIMD(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 2
strokerConvexFillAA_SIMD2(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 3
strokerConvexFillAA_SIMD3(&stroker, vertexList, NUM_VERTICES);
#else
strokerConvexFillAA(&stroker, vertexList, NUM_VERTICES);
#endif
}
int64_t elapsed = getHPCounter() - start;
printf("Elapsed (raw): %" PRId64 "\n", elapsed);
const int64_t freq = getHPFrequency();
double elapsed_msec = 1000.0 * (double)elapsed / (double)freq;
printf("Elapsed time: %f msec (%f usec / call)\n", elapsed_msec, elapsed_msec * 1000.0 / (double)NUM_ITERATIONS);
#if CREATE_VERTEX_BUFFER && TEST_SIMD && VERIFY_SIMD
{
printf("Veryfing SIMD vertex buffer...\n");
Stroker strokerRef;
memset(&strokerRef, 0, sizeof(Stroker));
strokerRef.m_FringeWidth = 1.0f;
strokerConvexFillAA(&strokerRef, vertexList, NUM_VERTICES);
double maxError = -1.0;
const float* simdVtx = &stroker.m_PosBuffer->x;
const float* refVtx = &strokerRef.m_PosBuffer->x;
for (uint32_t i = 0; i < stroker.m_NumVertices; ++i) {
const double dx = (double)simdVtx[0] - (double)refVtx[0];
const double dy = (double)simdVtx[1] - (double)refVtx[1];
const double distSqr = dx * dx + dy * dy; // squared distance between the SIMD and reference vertices
if (distSqr > maxError) {
maxError = distSqr;
}
simdVtx += 2;
refVtx += 2;
}
printf("- Max error: %g\n", maxError);
}
#endif
#if CREATE_INDEX_BUFFER && TEST_SIMD && VERIFY_SIMD
{
printf("Veryfing SIMD index buffer...\n");
Stroker strokerRef;
memset(&strokerRef, 0, sizeof(Stroker));
strokerRef.m_FringeWidth = 1.0f;
strokerConvexFillAA(&strokerRef, vertexList, NUM_VERTICES);
const uint16_t* simdID = stroker.m_IndexBuffer;
const uint16_t* refID = strokerRef.m_IndexBuffer;
bool err = false;
for (uint32_t i = 0; i < stroker.m_NumIndices; ++i) {
if (simdID[i] != refID[i]) {
printf("Index %d is wrong!\n", i);
err = true;
break;
}
}
if (!err) {
printf("SIMD index buffer is correct\n");
}
}
#endif
return 0;
}