Skip to content

Instantly share code, notes, and snippets.

@Mischa-Alff
Last active July 7, 2024 01:03
Show Gist options
  • Save Mischa-Alff/4fdb4e2e545212cfcbda to your computer and use it in GitHub Desktop.
Save Mischa-Alff/4fdb4e2e545212cfcbda to your computer and use it in GitHub Desktop.
Simple AABB implementation using SIMD instructions for x86-64
#include <chrono>
#include <ctime>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <random>
struct AABB{int32_t left, top, right, bottom;};
extern "C" bool check_aabb(AABB *a, AABB *b);
extern "C" bool check_aabb_conventional(AABB *a, AABB *b);
bool check_aabb_c(AABB *a, AABB *b);
#include <vector>
#include <algorithm>
const int aabbAmount = 10000;
const int aabbMinPos = 0;
const int aabbMaxPos = 1000000;
const int aabbMinSize = 1;
const int aabbMaxSize = 1000000;
void benchmark();
void test();
int main()
{
benchmark();
}
void benchmark()
{
AABB* aabbs = new AABB[aabbAmount];
std::default_random_engine generator;
std::uniform_int_distribution<int32_t> positionDistribution(aabbMinPos, aabbMaxPos);
std::uniform_int_distribution<int32_t> sizeDistribution(aabbMinSize, aabbMaxSize);
for(int32_t i = 0; i < aabbAmount; ++i)
{
aabbs[i].left = positionDistribution(generator);
aabbs[i].top = positionDistribution(generator);
aabbs[i].right = aabbs[i].left + sizeDistribution(generator);
aabbs[i].bottom = aabbs[i].top + sizeDistribution(generator);
};
std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
for(int32_t i = 0; i < aabbAmount; ++i)
{
for(int32_t j = i + 1; j < aabbAmount; ++j)
{
bool collision = check_aabb_c(&aabbs[i], &aabbs[j]);
collision = check_aabb_c(&aabbs[j], &aabbs[i]);
}
}
auto t = std::chrono::high_resolution_clock::now()-start;
std::cout<<std::setw(25)<<"Conventional (C): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;
start = std::chrono::high_resolution_clock::now();
for(int32_t i = 0; i < aabbAmount; ++i)
{
for(int32_t j = i + 1; j < aabbAmount; ++j)
{
bool collision = check_aabb_conventional(&aabbs[i], &aabbs[j]);
collision = check_aabb_conventional(&aabbs[j], &aabbs[i]);
}
}
t = std::chrono::high_resolution_clock::now()-start;
std::cout<<std::setw(25)<<"Conventional (ASM): "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;
start = std::chrono::high_resolution_clock::now();
for(int32_t i = 0; i < aabbAmount; ++i)
{
for(int32_t j = i + 1; j < aabbAmount; ++j)
{
bool collision = check_aabb(&aabbs[i], &aabbs[j]);
collision = check_aabb(&aabbs[j], &aabbs[i]);
}
}
t = std::chrono::high_resolution_clock::now()-start;
std::cout<<std::setw(25)<<"SIMD: "<<std::chrono::duration_cast<std::chrono::milliseconds>(t).count()<<std::endl;
std::cout<<"These results are not trustworthy due to loop and other overhead. Just use them to see \"which is faster\" instead of as a real performance metric. Use callgrind for that."<<std::endl;
}
bool check_aabb_c(AABB *a, AABB *b)
{
return !((b->left)>(a->right) || (b->right)<(a->left) || (b->top)>(a->bottom) || (b->bottom)<(a->top));
}
; AABB.nasm (c) 2015 Mischa "Aster" Alff
; x86-64 Assembly implementations of AABB intersection checks.
; The algorithm in use here is:
; !(B.left>A.right || B.right<A.left || B.top>A.bottom || B.bottom<A.top)
section .data
section .bss
aabb:
.left: resd 1
.top: resd 1
.right: resd 1
.bottom: resd 1
section .text
global check_aabb
global check_aabb_conventional
; check_aabb: calculates intersection between two AABBs using SSE
; Notes: This routine runs 2.66 times faster than `check_aabb_conventional`
; using the yasm assembler on x86-64 (Intel Haswell i7-4770)
; Takes two arguments: *AABB and *AABB, in rdi and rsi respectively.
check_aabb:
; load the first AABB from memory into xmm0 as four 32-bit signed ints
movdqa xmm0, [rdi]
; load the second AABB from memory into xmm1 as four 32-bit signed ints
movdqa xmm1, [rsi]
; shuffle the second AABB so that top/bottom and left/right are inverted
shufps xmm1, xmm1, 01001110b
; compare each signed 32-bit integer in xmm1 to the ones in xmm0 and
; store the result in xmm1 as a bit mask
; (0xFFFFFFFF for true, 0x0 for false)
pcmpgtd xmm1, xmm0
; obtain that bitmask in a two-byte register (we use rax here because it
; zeroes out all the other bits)
pmovmskb rax, xmm1
; invert the greater-than for two members to make them
; less-than-or-equal-to (yes, this is not as intended, but since this
; is a PoC and will probably be adapted for floating point, nobody
; cares whether it's > or >= for now.)
xor eax, 0x00FF
; Convert our bitmask to a boolean, and not it:
; If the result of the xor is zero, the zero flag is 1, and thus al
; is 1 (and thus rax is 1)
; If not, the ZF is 0, and thus al is 0, and rax is 0.
setz al
; Still not sure if this is required, but better be safe than sorry,
; I guess. (Yes, C++ treats >0 as true, but I want a true logical 0/1)
and eax, 0x1
ret
; check_aabb_conventional : calculates intersection between two AABBs using
; conventional methods
; Notes: This is a naïve port of the C version mentioned above
; Takes two arguments: *AABB and *AABB, in rdi and rsi respectively.
check_aabb_conventional:
; set ecx to 0, as we'll be using it for our OR accumulator
xor ecx, ecx
; load B.left into eax, A.right into ebx, and compare for greater-than
mov eax, dword [rdi + (aabb.left - aabb)]
mov ebx, dword [rsi + (aabb.right - aabb)]
cmp eax, ebx
; place the result of the comparison in cl
setg cl
; ... repeat
mov eax, dword [rdi + (aabb.right - aabb)]
mov ebx, dword [rsi + (aabb.left - aabb)]
cmp eax, ebx
; place the result of the comparison in ch
setle ch
; or cl and ch, repeat
or cl, ch
mov eax, dword [rdi + (aabb.top - aabb)]
mov ebx, dword [rsi + (aabb.bottom - aabb)]
cmp eax, ebx
setg ch
or cl, ch
mov eax, dword [rdi + (aabb.bottom - aabb)]
mov ebx, dword [rsi + (aabb.top - aabb)]
cmp eax, ebx
setle ch
or cl, ch
; convert to logical bool and NOT.
not ecx
and ecx, 1
mov eax, ecx
ret
yasm -f elf64 AABB.nasm -o AABB.o -l AABB.lst
g++ AABB.cpp -c -o AABB.cpp.o -std=c++11
g++ AABB.cpp.o AABB.o -o AABB
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment