Skip to content

Instantly share code, notes, and snippets.

@roxlu roxlu/info.txt Secret
Last active Dec 27, 2015

Embed
What would you like to do?
Researching SIMD optimizations for basic particle simulation (gamedev), what structure to use:
- Structure of Arrays (SoA)
- Array of Structures (AoS)
- Array of Structure of Arrays <-- testing this approach (AoSoA)
I've been told that AoSoA is the most optimal way to implement a basic particle system with SIMD. However,
the current code, as shown below, suggests otherwise: when using an SoA solution I'm measuring 0.060 ms, but when
using AoSoA I'm getting 0.16 ms, which is insanely slow.
I'm pasting a perf report below which shows that there is a huge performance penalty of 13.48% and 17.36%;
ignore the 26.58%, as that one is an initialization artifact.
Any ideas why this AoSoA solution is so slow? and how to optimize?
#include <uv.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <immintrin.h>
/* Select exactly one layout: AVX (8-wide), SSE (4-wide), or plain scalar AoS. */
#define USE_SIMD256_INTERLEAVED 0
#define USE_SIMD128_INTERLEAVED 1
/* Benchmark shape: NTESTS timed runs of NLOOPS simulation steps each. */
#define NTESTS 10
#define NLOOPS 3000
/* NOTE(review): 4096 is a page size; actual cache lines are typically 64
   bytes. This value only controls the prefetch stride below — confirm the
   intended granularity. */
#define CACHE_LINE_SIZE 4096
#define PARTICLE_COUNT (100*1000)
/* Each particle_group packs the state of 8 (AVX), 4 (SSE), or 1 particle(s),
   so the group count shrinks accordingly. */
#if USE_SIMD256_INTERLEAVED /* one particle struct contains the values of 8 particles */
# define GROUP_COUNT (PARTICLE_COUNT / 8)
#elif USE_SIMD128_INTERLEAVED
# define GROUP_COUNT (PARTICLE_COUNT / 4)
#else
# define GROUP_COUNT PARTICLE_COUNT
#endif
/* AoSoA building block: one struct holds the position, accumulated force and
   velocity for a whole SIMD lane-group of particles (8 lanes for AVX, 4 for
   SSE, a single particle in the scalar fallback). Lane i of every member
   belongs to the same particle. sizeof is 6 vectors, i.e. 96 bytes in the
   SSE configuration. */
struct particle_group
{
#if USE_SIMD256_INTERLEAVED
__m256 pos_x, pos_y, force_x, force_y, vel_x, vel_y;
#elif USE_SIMD128_INTERLEAVED
__m128 pos_x, pos_y, force_x, force_y, vel_x, vel_y;
#else
float pos_x, pos_y, force_x, force_y, vel_x, vel_y;
#endif
};
/* Global simulation state: GROUP_COUNT groups, allocated 256-byte aligned in
   main() via posix_memalign. */
particle_group* particles;
/* Adds a constant (x, y) force to every particle's force accumulator. */
void add_force(float x, float y);
/* Integrates one step: velocity += force, position += velocity, applies drag,
   then clears the force accumulators. */
void step();
/* Returns the largest power of two that is <= x (for x >= 1); returns 0 for
   x <= 0. Used to derive a power-of-two prefetch stride so the loop can gate
   prefetches with a cheap mask test instead of a modulo.
   BUG FIX: the bit-smearing is done in unsigned arithmetic. The original
   smeared in signed int, so for any x with bit 30 set the smeared value
   became INT_MAX and the `+ 1` overflowed — signed overflow is undefined
   behavior in C/C++. */
int power_of_two_below(int x)
{
/* Smear the highest set bit into every lower position: the value becomes
   2^(k+1) - 1 where k is the index of x's highest set bit. */
unsigned u = (unsigned)x;
u |= u >> 1;
u |= u >> 2;
u |= u >> 4;
u |= u >> 8;
u |= u >> 16;
/* (2^(k+1) - 1) + 1 = 2^(k+1); halve it to get 2^k, the power below x. */
return (int)((u + 1) >> 1);
}
/* Benchmark driver: runs NTESTS timed batches of NLOOPS simulation steps over
   a zeroed particle buffer and reports per-loop timings in milliseconds.
   Fixes vs. original: the posix_memalign result is checked (using the buffer
   after a failed call would dereference an indeterminate pointer), the
   uint64_t is printed with a matching format/cast instead of %lld, the dead
   loop whose body was entirely commented out is removed, the "Avarage" typo
   in the output is corrected, and the buffer is freed. */
int main()
{
/* 256-byte alignment keeps every group comfortably within SIMD alignment
   requirements (16B for __m128 / 32B for __m256). */
if (posix_memalign((void**)&particles, 256, sizeof(particle_group) * GROUP_COUNT) != 0) {
fprintf(stderr, "posix_memalign failed for %zu bytes\n", sizeof(particle_group) * GROUP_COUNT);
return 1;
}
uint64_t total_time = 0;
for (int k = 0; k < NTESTS; ++k) {
/* Reset all state so every test batch starts from the same data. */
memset((char*)particles, 0x00, sizeof(particle_group) * GROUP_COUNT);
uint64_t start = uv_hrtime(); /* nanosecond monotonic clock (libuv) */
for (int i = 0; i < NLOOPS; ++i) {
add_force(0.0005f, 0.03f);
step();
}
uint64_t d = uv_hrtime() - start;
printf("Took: %llu, millis: %f, millis per loop: %f\n",
(unsigned long long)d, (double)d / 1000000.0, ((double)d / 1000000.0) / (double)NLOOPS);
total_time += d;
}
double avg = ((double)total_time / 1000000.0) / (double)NTESTS;
printf("Average: %f, ms: %f\n", avg, avg / (double)NLOOPS);
free(particles);
return 0;
}
// ----------------------------------------------------------------------
/*
prefetch heuristics:
_mm_prefetch is a hint asking the CPU to pull the addressed line into the cache
*/
/* Accumulates a constant (x, y) force into every particle's force_x/force_y.
   The force is broadcast into all SIMD lanes once, then added group by group.
   Fixes vs. original:
   - The AVX branch carried an `#error "Need to implement"` and an inverted
     prefetch gate: `if (i % 16)` fires on every iteration EXCEPT the block
     boundary — the opposite of the intent. It now mirrors the working SSE
     path (prefetch once per power-of-two stride of groups).
   - Prefetch pointers are cast to `char const*`, matching _mm_prefetch's
     declared signature, instead of relying on an implicit conversion. */
void add_force(float x, float y)
{
#if USE_SIMD256_INTERLEAVED
/* _mm_set1_ps/_mm256_set1_ps broadcast one scalar to all lanes; equivalent
   to the original set_ps(x, ..., x) but states the intent directly. */
__m256 force_x = _mm256_set1_ps(x);
__m256 force_y = _mm256_set1_ps(y);
#elif USE_SIMD128_INTERLEAVED
__m128 force_x = _mm_set1_ps(x);
__m128 force_y = _mm_set1_ps(y);
#endif
/* Power-of-two number of groups per CACHE_LINE_SIZE window, so the boundary
   test below can be a cheap mask instead of a modulo. */
int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0]));
for (int i = 0; i < GROUP_COUNT; ++i)
{
#if USE_SIMD256_INTERLEAVED
/* Prefetch the current and the next window once per window. */
if ((i & (prefetch_count - 1)) == 0)
{
_mm_prefetch((char const*)(particles + i), _MM_HINT_T0);
_mm_prefetch((char const*)(particles + i + prefetch_count), _MM_HINT_T0);
}
particles[i].force_x = _mm256_add_ps(particles[i].force_x, force_x);
particles[i].force_y = _mm256_add_ps(particles[i].force_y, force_y);
#elif USE_SIMD128_INTERLEAVED
if ((i & (prefetch_count - 1)) == 0)
{
_mm_prefetch((char const*)(particles + i), _MM_HINT_T0);
_mm_prefetch((char const*)(particles + i + prefetch_count), _MM_HINT_T0);
}
particles[i].force_x = _mm_add_ps(particles[i].force_x, force_x);
particles[i].force_y = _mm_add_ps(particles[i].force_y, force_y);
#else
particles[i].force_x += x;
particles[i].force_y += y;
#endif
}
}
/* Advances the simulation by one step for every particle group:
     vel   += force          (integrate force)
     pos   += vel            (integrate velocity)
     vel   *= 0.99           (drag)
     force  = 0              (clear the accumulator for the next frame)
   The load/store ordering is deliberate: the new velocity is computed into a
   local vector first, so the position update uses the post-force velocity and
   the drag is applied before storing it back. */
void step()
{
#if USE_SIMD256_INTERLEAVED
__m256 const null_force = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
__m256 const drag = _mm256_set_ps(0.99f, 0.99f, 0.99f, 0.99f, 0.99f, 0.99f, 0.99f, 0.99f);
#elif USE_SIMD128_INTERLEAVED
__m128 const null_force = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
__m128 const drag = _mm_set_ps(0.99f, 0.99f, 0.99f, 0.99f);
#else
float const null_force = 0.0f;
float const drag = 0.99f;
#endif
/* Power-of-two stride of groups per CACHE_LINE_SIZE window; allows the
   prefetch gate below to be a mask test instead of a modulo. */
int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0]));
for (int i = 0; i < GROUP_COUNT; ++i)
{
#if USE_SIMD256_INTERLEAVED
/* TODO */
/* Once per window, hint the current and the next window into L1. */
if((i & (prefetch_count - 1)) == 0)
{
_mm_prefetch(particles + i, _MM_HINT_T0);
_mm_prefetch(particles + i + prefetch_count, _MM_HINT_T0);
}
__m256 vel_x = _mm256_add_ps(particles[i].vel_x, particles[i].force_x);
__m256 vel_y = _mm256_add_ps(particles[i].vel_y, particles[i].force_y);
particles[i].pos_x = _mm256_add_ps(particles[i].pos_x, vel_x);
particles[i].pos_y = _mm256_add_ps(particles[i].pos_y, vel_y);
particles[i].vel_x = _mm256_mul_ps(vel_x, drag);
particles[i].vel_y = _mm256_mul_ps(vel_y, drag);
particles[i].force_x = null_force;
particles[i].force_y = null_force;
#elif USE_SIMD128_INTERLEAVED
if((i & (prefetch_count - 1)) == 0)
{
_mm_prefetch(particles + i, _MM_HINT_T0);
_mm_prefetch(particles + i + prefetch_count, _MM_HINT_T0);
}
/* New velocity held in a register so the stores below see consistent data. */
__m128 vel_x = _mm_add_ps(particles[i].vel_x, particles[i].force_x);
__m128 vel_y = _mm_add_ps(particles[i].vel_y, particles[i].force_y);
particles[i].pos_x = _mm_add_ps(particles[i].pos_x, vel_x);
particles[i].pos_y = _mm_add_ps(particles[i].pos_y, vel_y);
particles[i].vel_x = _mm_mul_ps(vel_x, drag);
particles[i].vel_y = _mm_mul_ps(vel_y, drag);
particles[i].force_x = null_force;
particles[i].force_y = null_force;
#else
/* Scalar fallback: identical math, one particle per group. */
float vel_x = particles[i].vel_x + particles[i].force_x;
float vel_y = particles[i].vel_y + particles[i].force_y;
particles[i].pos_x += vel_x;
particles[i].pos_y += vel_y;
particles[i].vel_x = vel_x * drag;
particles[i].vel_y = vel_y * drag;
particles[i].force_x = null_force;
particles[i].force_y = null_force;
#endif
}
}
│ mov $0x249f00,%edx ▒
│ mov $0x100,%esi ▒
│ push %rbp ▒
│ mov $0x623a00,%edi ▒
│ xor %r12d,%r12d ▒
│ push %rbx ▒
│ mov $0xa,%ebx ▒
│ sub $0x40,%rsp ▒
│ → callq posix_memalign@plt ▒
│ vmovap 0x15fa4(%rip),%xmm6 # 41c630 <_IO_stdin_used+0x50> ▒
│ vxorps %xmm2,%xmm2,%xmm2 ▒
│ vmovap 0x15fa8(%rip),%xmm5 # 41c640 <_IO_stdin_used+0x60> ▒
│ vmovap 0x15fb0(%rip),%xmm3 # 41c650 <_IO_stdin_used+0x70> ▒
│ 40: mov particles,%rdi ▒
│ xor %esi,%esi ▒
│ mov $0x249f00,%edx ▒
│ vmovap %xmm2,0x30(%rsp) ▒
│ vmovap %xmm3,0x20(%rsp) ▒
│ vmovap %xmm5,0x10(%rsp) ▒
│ vmovap %xmm6,(%rsp) ▒
│ → callq memset@plt ◆
│ → callq uv_hrtime ▒
│ mov particles,%rsi ▒
│ vmovap (%rsp),%xmm6 ▒
│ mov %rax,%rbp ▒
│ mov $0xbb8,%eax ▒
│ vmovap 0x10(%rsp),%xmm5 ▒
│ vmovap 0x20(%rsp),%xmm3 ▒
│ vmovap 0x30(%rsp),%xmm2 ▒
│ nop ▒
0.01 │ 98: mov %rsi,%rdx ▒
│ xor %ecx,%ecx ▒
│ ↓ jmp c3 ▒
│ nop ▒
1.58 │ a0: vaddps 0x20(%rdx),%xmm6,%xmm0 ▒
26.58 │ add $0x1,%ecx ▒
1.19 │ add $0x60,%rdx ▒
0.21 │ vmovap %xmm0,-0x40(%rdx) ▒
2.80 │ vaddps -0x30(%rdx),%xmm5,%xmm0 ▒
2.34 │ vmovap %xmm0,-0x30(%rdx) ▒
2.13 │ cmp $0x61a8,%ecx ▒
│ ↓ je d8 ▒
0.23 │ c3: test $0x1f,%cl ▒
│ ↑ jne a0 ▒
0.07 │ prefet (%rdx) ▒
│ prefet 0xc00(%rdx) ▒
0.50 │ ↑ jmp a0 ▒
│ nop ▒
│ d8: mov %rsi,%rdx ▒
│ xor %cx,%cx ▒
│ ↓ jmp 133 ▒
1.23 │ e0: vmovap 0x40(%rdx),%xmm1 ▒
13.48 │ add $0x1,%ecx ▒
1.95 │ add $0x60,%rdx ▒
0.12 │ vmovap -0x10(%rdx),%xmm0 ▒
2.36 │ vaddps -0x40(%rdx),%xmm1,%xmm1 ▒
17.36 │ vmovap %xmm2,-0x40(%rdx) ▒
0.93 │ vaddps -0x30(%rdx),%xmm0,%xmm0 ▒
3.22 │ vmovap %xmm2,-0x30(%rdx) ▒
0.58 │ vaddps -0x60(%rdx),%xmm1,%xmm4 ▒
6.38 │ vmulps %xmm3,%xmm1,%xmm1 ▒
5.32 │ vmovap %xmm4,-0x60(%rdx) ▒
0.62 │ vaddps -0x50(%rdx),%xmm0,%xmm4 ▒
0.67 │ vmulps %xmm3,%xmm0,%xmm0 ▒
3.29 │ vmovap %xmm1,-0x20(%rdx) ▒
1.26 │ vmovap %xmm4,-0x50(%rdx) ▒
0.18 │ vmovap %xmm0,-0x10(%rdx) ▒
2.83 │ cmp $0x61a8,%ecx ▒
│ ↓ je 148 ▒
0.11 │133: test $0x1f,%cl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.