Skip to content

Instantly share code, notes, and snippets.

@roxlu
Created October 30, 2013 22:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save roxlu/1ab889f47b487ae51aad to your computer and use it in GitHub Desktop.
Save roxlu/1ab889f47b487ae51aad to your computer and use it in GitHub Desktop.
0.00 │ nop ▒
│ /* Perform the respective operation on the four SPFP values in A and B. */ ▒
│ ▒
│ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) ▒
│ _mm_add_ps (__m128 __A, __m128 __B) ▒
│ { ▒
│ return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); ▒
2.54 │ 80:┌─→vaddps (%rdx),%xmm3,%xmm0 ▒
│ │ __m128 force_y = _mm_set_ps(y, y, y, y); ▒
│ │ ▒
│ │ int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0])); ▒
│ │ __m128 tmp_x; ▒
│ │ __m128 tmp_y; ▒
│ │ for(int i = 0; i < GROUP_COUNT; ++i){ ▒
38.43 │ │ add $0x1,%ecx ▒
2.01 │ │ add $0x60,%rdx ▒
│ │ // _mm_prefetch(particles + i + prefetch_count, _MM_HINT_T0); ▒
│ │ } ▒
│ │ tmp_x = particles[i].force_x; ▒
│ │ tmp_y = particles[i].force_y; ▒
│ │ ▒
│ │ particles[i].force_x = _mm_add_ps(tmp_x, force_x); ▒
0.00 │ │ vmovap %xmm0,-0x60(%rdx) ▒
4.02 │ │ vaddps -0x50(%rdx),%xmm2,%xmm0 ▒
│ │ particles[i].force_y = _mm_add_ps(tmp_y, force_y); ▒
3.62 │ │ vmovap %xmm0,-0x50(%rdx) ◆
│ │ __m128 force_y = _mm_set_ps(y, y, y, y); ▒
│ │ ▒
│ │ int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0])); ▒
│ │ __m128 tmp_x; ▒
│ │ __m128 tmp_y; ▒
│ │ for(int i = 0; i < GROUP_COUNT; ++i){ ▒
3.56 │ │ cmp $0x61a8,%ecx ▒
│ └──jne 80 ▒
│ mov %rsi,%rdx ▒
│ mov $0x30d4,%cx ▒
│ nop ▒
│ for (i = 0; i < total; i = i + 2){ ▒
│ // _mm_prefetch(particles + i , _MM_HINT_T0); ▒
│ pos_x = particles[i].pos_x; ▒
│ pos_y = particles[i].pos_y; ▒
│ vel_x = particles[i].vel_x * particles[i].force_x; ▒
│ vel_y = particles[i].vel_y * particles[i].force_y; ▒
0.20 │ b0: vmovap 0x50(%rdx),%xmm0 ▒
9.40 │ add $0xc0,%rdx ▒
│ ▒
│ for (i = 0; i < total; i = i + 2){ ▒
│ // _mm_prefetch(particles + i , _MM_HINT_T0); ▒
│ pos_x = particles[i].pos_x; ▒
│ pos_y = particles[i].pos_y; ▒
│ vel_x = particles[i].vel_x * particles[i].force_x; ▒
0.64 │ vmovap -0x80(%rdx),%xmm1 ▒
│ vel_y = particles[i].vel_y * particles[i].force_y; ▒
2.55 │ vmulps -0x90(%rdx),%xmm0,%xmm0 ▒
│ ▒
│ for (i = 0; i < total; i = i + 2){ ▒
│ // _mm_prefetch(particles + i , _MM_HINT_T0); ▒
│ pos_x = particles[i].pos_x; ▒
│ pos_y = particles[i].pos_y; ▒
│ vel_x = particles[i].vel_x * particles[i].force_x; ▒
19.04 │ vmulps -0xa0(%rdx),%xmm1,%xmm1 ▒
│ vel_y = particles[i].vel_y * particles[i].force_y; ▒
│ particles[i].pos_x = pos_x + vel_x; ▒
│ particles[i].pos_y = pos_y + vel_y; ▒
3.77 │ vaddps -0xb0(%rdx),%xmm0,%xmm0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment