-
-
Save roxlu/c1a61e2b1ca85f843043 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
│ 80: mov %rsi,%rdx
│ xor %ecx,%ecx
│ nop
1.70 │ 88: vaddps (%rdx),%xmm3,%xmm0
55.13 │ add $0x1,%ecx
0.82 │ add $0x60,%rdx
1.59 │ vmovap %xmm0,-0x60(%rdx)
4.75 │ vaddps -0x50(%rdx),%xmm2,%xmm0
6.60 │ vmovap %xmm0,-0x50(%rdx)
3.22 │ cmp $0x61a8,%ecx
│ ↑ jne 88
│ mov %rdi,%rcx
│ xor %edx,%edx
│ ↓ jmp c7
│ nop
4.06 │ b8: add $0x1,%edx
│ add $0x60,%rcx
│ cmp $0x61a8,%edx
│ ↓ je de
3.10 │ c7: test $0x1,%dl
│ ↑ jne b8
4.45 │ add $0x1,%edx
│ prefet (%rcx)
14.57 │ add $0x60,%rcx
│ cmp $0x61a8,%edx
│ ↑ jne c7
0.01 │ de: sub $0x1,%eax
│ ↑ jne 80
│ vmovap %xmm2,0x10(%rsp)
│ vmovap %xmm3,(%rsp)
│ → callq uv_hrtime
│ vmovap (%rsp),%xmm3
│ sub %rbp,%rax
│ vmovap 0x10(%rsp),%xmm2
│ mov %rax,%rbp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
struct particle_group { | |
__m128 pos_x; | |
__m128 pos_y; | |
__m128 force_x; | |
__m128 force_y; | |
__m128 vel_x; | |
__m128 vel_y; | |
}; | |
void add_force(float x, float y) { | |
__m128 force_x = _mm_set_ps(x, x, x, x); | |
__m128 force_y = _mm_set_ps(y, y, y, y); | |
int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0])); | |
__m128 tmp_x; | |
__m128 tmp_y; | |
for(int i = 0; i < GROUP_COUNT; ++i){ | |
if(i % 256 == 0) { | |
// _mm_prefetch(particles + i + 1, _MM_HINT_T0); | |
// _mm_prefetch(particles + i + prefetch_count, _MM_HINT_T0); | |
} | |
tmp_x = particles[i].force_x; | |
tmp_y = particles[i].force_y; | |
particles[i].force_x = _mm_add_ps(tmp_x, force_x); | |
particles[i].force_y = _mm_add_ps(tmp_y, force_y); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment