-
-
Save roxlu/1ab889f47b487ae51aad to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0.00 │ nop ▒ | |
│ /* Perform the respective operation on the four SPFP values in A and B. */ ▒ | |
│ ▒ | |
│ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) ▒ | |
│ _mm_add_ps (__m128 __A, __m128 __B) ▒ | |
│ { ▒ | |
│ return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); ▒ | |
2.54 │ 80:┌─→vaddps (%rdx),%xmm3,%xmm0 ▒ | |
│ │ __m128 force_y = _mm_set_ps(y, y, y, y); ▒ | |
│ │ ▒ | |
│ │ int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0])); ▒ | |
│ │ __m128 tmp_x; ▒ | |
│ │ __m128 tmp_y; ▒ | |
│ │ for(int i = 0; i < GROUP_COUNT; ++i){ ▒ | |
38.43 │ │ add $0x1,%ecx ▒ | |
2.01 │ │ add $0x60,%rdx ▒ | |
│ │ // _mm_prefetch(particles + i + prefetch_count, _MM_HINT_T0); ▒ | |
│ │ } ▒ | |
│ │ tmp_x = particles[i].force_x; ▒ | |
│ │ tmp_y = particles[i].force_y; ▒ | |
│ │ ▒ | |
│ │ particles[i].force_x = _mm_add_ps(tmp_x, force_x); ▒ | |
0.00 │ │ vmovap %xmm0,-0x60(%rdx) ▒ | |
4.02 │ │ vaddps -0x50(%rdx),%xmm2,%xmm0 ▒ | |
│ │ particles[i].force_y = _mm_add_ps(tmp_y, force_y); ▒ | |
3.62 │ │ vmovap %xmm0,-0x50(%rdx) ◆ | |
│ │ __m128 force_y = _mm_set_ps(y, y, y, y); ▒ | |
│ │ ▒ | |
│ │ int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0])); ▒ | |
│ │ __m128 tmp_x; ▒ | |
│ │ __m128 tmp_y; ▒ | |
│ │ for(int i = 0; i < GROUP_COUNT; ++i){ ▒ | |
3.56 │ │ cmp $0x61a8,%ecx ▒ | |
│ └──jne 80 ▒ | |
│ mov %rsi,%rdx ▒ | |
│ mov $0x30d4,%cx ▒ | |
│ nop ▒ | |
│ for (i = 0; i < total; i = i + 2){ ▒ | |
│ // _mm_prefetch(particles + i , _MM_HINT_T0); ▒ | |
│ pos_x = particles[i].pos_x; ▒ | |
│ pos_y = particles[i].pos_y; ▒ | |
│ vel_x = particles[i].vel_x * particles[i].force_x; ▒ | |
│ vel_y = particles[i].vel_y * particles[i].force_y; ▒ | |
0.20 │ b0: vmovap 0x50(%rdx),%xmm0 ▒ | |
9.40 │ add $0xc0,%rdx ▒ | |
│ ▒ | |
│ for (i = 0; i < total; i = i + 2){ ▒ | |
│ // _mm_prefetch(particles + i , _MM_HINT_T0); ▒ | |
│ pos_x = particles[i].pos_x; ▒ | |
│ pos_y = particles[i].pos_y; ▒ | |
│ vel_x = particles[i].vel_x * particles[i].force_x; ▒ | |
0.64 │ vmovap -0x80(%rdx),%xmm1 ▒ | |
│ vel_y = particles[i].vel_y * particles[i].force_y; ▒ | |
2.55 │ vmulps -0x90(%rdx),%xmm0,%xmm0 ▒ | |
│ ▒ | |
│ for (i = 0; i < total; i = i + 2){ ▒ | |
│ // _mm_prefetch(particles + i , _MM_HINT_T0); ▒ | |
│ pos_x = particles[i].pos_x; ▒ | |
│ pos_y = particles[i].pos_y; ▒ | |
│ vel_x = particles[i].vel_x * particles[i].force_x; ▒ | |
19.04 │ vmulps -0xa0(%rdx),%xmm1,%xmm1 ▒ | |
│ vel_y = particles[i].vel_y * particles[i].force_y; ▒ | |
│ particles[i].pos_x = pos_x + vel_x; ▒ | |
│ particles[i].pos_y = pos_y + vel_y; ▒ | |
3.77 │ vaddps -0xb0(%rdx),%xmm0,%xmm0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment