-
-
Save roxlu/3be5b919b495094a402f to your computer and use it in GitHub Desktop.
Compiled with "-ggdb", run under gdb, interrupted with Ctrl+C, then typed: disas /m
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 int main() { | |
0x0000000000406660 <+0>: push %r12 | |
0x000000000040666c <+12>: push %rbp | |
0x0000000000406675 <+21>: push %rbx | |
0x000000000040667b <+27>: sub $0x20,%rsp | |
40 posix_memalign((void**)&particles, 256, sizeof(particle_group) * GROUP_COUNT); | |
0x0000000000406662 <+2>: mov $0x249f00,%edx | |
0x0000000000406667 <+7>: mov $0x100,%esi | |
0x000000000040666d <+13>: mov $0x6238e0,%edi | |
0x0000000000406676 <+22>: mov $0xa,%ebx | |
0x000000000040667f <+31>: callq 0x405de0 <posix_memalign@plt> | |
0x0000000000406684 <+36>: vmovaps 0x15ea4(%rip),%xmm3 # 0x41c530 | |
0x000000000040668c <+44>: vmovaps 0x15eac(%rip),%xmm2 # 0x41c540 | |
41 | |
42 uint64_t total_time = 0; | |
0x0000000000406672 <+18>: xor %r12d,%r12d | |
0x00000000004066e3 <+131>: mov $0x61a8,%ecx | |
0x00000000004066e8 <+136>: nopl 0x0(%rax,%rax,1) | |
43 | |
44 for(int k = 0; k < NTESTS; ++k) { | |
0x000000000040679f <+319>: sub $0x1,%ebx | |
0x00000000004067a2 <+322>: vmovaps (%rsp),%xmm3 | |
0x00000000004067a7 <+327>: vmovaps 0x10(%rsp),%xmm2 | |
0x00000000004067ad <+333>: jne 0x406694 <main()+52> | |
45 | |
46 memset((char*)particles, 0x00, sizeof(particle_group) * GROUP_COUNT); | |
0x0000000000406694 <+52>: mov 0x21d245(%rip),%rdi # 0x6238e0 <particles> | |
0x000000000040669b <+59>: mov $0x249f00,%edx | |
0x00000000004066a0 <+64>: xor %esi,%esi | |
0x00000000004066a2 <+66>: vmovaps %xmm2,0x10(%rsp) | |
0x00000000004066a8 <+72>: vmovaps %xmm3,(%rsp) | |
0x00000000004066ad <+77>: callq 0x405da0 <memset@plt> | |
47 | |
48 uint64_t start = uv_hrtime(); | |
0x00000000004066b2 <+82>: callq 0x406abd <uv_hrtime> | |
0x00000000004066b7 <+87>: mov 0x21d222(%rip),%rdx # 0x6238e0 <particles> | |
0x00000000004066be <+94>: vmovaps (%rsp),%xmm3 | |
0x00000000004066c3 <+99>: mov %rax,%rbp | |
0x00000000004066c6 <+102>: mov $0xbb8,%eax | |
0x00000000004066cb <+107>: vmovaps 0x10(%rsp),%xmm2 | |
0x00000000004066d1 <+113>: lea 0x60(%rdx),%rdi | |
0x00000000004066d5 <+117>: lea 0x20(%rdx),%rsi | |
0x00000000004066d9 <+121>: nopl 0x0(%rax) | |
0x00000000004066e0 <+128>: mov %rsi,%rdx | |
49 for(int i = 0; i < NLOOPS; ++i) { | |
0x000000000040673e <+222>: sub $0x1,%eax | |
0x0000000000406741 <+225>: jne 0x4066e0 <main()+128> | |
0x0000000000406743 <+227>: vmovaps %xmm2,0x10(%rsp) | |
0x0000000000406749 <+233>: vmovaps %xmm3,(%rsp) | |
50 add_force(0.0005, 0.03); | |
51 step(); | |
52 } | |
53 uint64_t d = uv_hrtime() - start; | |
0x000000000040674e <+238>: callq 0x406abd <uv_hrtime> | |
54 printf("Took: %lld, millis: %f, millis per loop: %f\n", d, double(d)/1000000.0, (double(d)/1000000.0)/float(NLOOPS)); | |
0x0000000000406753 <+243>: vmovaps (%rsp),%xmm3 | |
0x0000000000406758 <+248>: sub %rbp,%rax | |
0x000000000040675b <+251>: vmovaps 0x10(%rsp),%xmm2 | |
0x0000000000406761 <+257>: mov %rax,%rbp | |
0x0000000000406764 <+260>: js 0x4067ef <main()+399> | |
0x000000000040676a <+266>: vcvtsi2sd %rax,%xmm0,%xmm0 | |
0x000000000040676f <+271>: mov %rbp,%rsi | |
0x0000000000406772 <+274>: mov $0x41c4e8,%edi | |
0x0000000000406777 <+279>: mov $0x2,%eax | |
0x000000000040677c <+284>: vmovaps %xmm3,(%rsp) | |
0x0000000000406781 <+289>: vdivsd 0x15dc7(%rip),%xmm0,%xmm0 # 0x41c550 | |
0x0000000000406792 <+306>: vdivsd 0x15dbe(%rip),%xmm0,%xmm1 # 0x41c558 | |
0x000000000040679a <+314>: callq 0x405d80 <printf@plt> | |
0x00000000004067ef <+399>: mov %rbp,%rdx | |
0x00000000004067f2 <+402>: shr %rax | |
0x00000000004067f5 <+405>: and $0x1,%edx | |
0x00000000004067f8 <+408>: or %rdx,%rax | |
0x00000000004067fb <+411>: vcvtsi2sd %rax,%xmm0,%xmm0 | |
0x0000000000406800 <+416>: vaddsd %xmm0,%xmm0,%xmm0 | |
0x0000000000406804 <+420>: jmpq 0x40676f <main()+271> | |
55 | |
56 total_time += d; | |
0x0000000000406789 <+297>: add %rbp,%r12 | |
0x000000000040678c <+300>: vmovaps %xmm2,0x10(%rsp) | |
57 } | |
58 | |
59 for(int i = 0; i < 10; ++i) { | |
60 int dx = i * 2; | |
61 // printf("force_x: %f, force_y: %f, pos_x: %f, pos_y: %f\n", forces[dx + 0], forces[dx + 1], positions[dx + 0], positions[dx + 1]); | |
62 } | |
63 | |
64 double avg = (double(total_time)/1000000.0)/double(NTESTS); | |
0x00000000004067b3 <+339>: test %r12,%r12 | |
0x00000000004067b6 <+342>: js 0x406809 <main()+425> | |
0x00000000004067b8 <+344>: vcvtsi2sd %r12,%xmm0,%xmm0 | |
0x00000000004067c7 <+359>: vdivsd 0x15d81(%rip),%xmm0,%xmm0 # 0x41c550 | |
0x00000000004067cf <+367>: vdivsd 0x15d89(%rip),%xmm0,%xmm0 # 0x41c560 | |
0x0000000000406809 <+425>: mov %r12,%rax | |
0x000000000040680c <+428>: and $0x1,%r12d | |
0x0000000000406810 <+432>: shr %rax | |
0x0000000000406813 <+435>: or %r12,%rax | |
0x0000000000406816 <+438>: vcvtsi2sd %rax,%xmm0,%xmm0 | |
0x000000000040681b <+443>: vaddsd %xmm0,%xmm0,%xmm0 | |
0x000000000040681f <+447>: jmp 0x4067bd <main()+349> | |
0x0000000000406821: data32 data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1) | |
65 printf("Avarage: %f, ms: %f\n", avg, avg/float(NLOOPS)); | |
0x00000000004067bd <+349>: mov $0x41c518,%edi | |
0x00000000004067c2 <+354>: mov $0x2,%eax | |
0x00000000004067d7 <+375>: vdivsd 0x15d79(%rip),%xmm0,%xmm1 # 0x41c558 | |
0x00000000004067df <+383>: callq 0x405d80 <printf@plt> | |
66 } | |
0x00000000004067e4 <+388>: add $0x20,%rsp | |
0x00000000004067e8 <+392>: xor %eax,%eax | |
0x00000000004067ea <+394>: pop %rbx | |
0x00000000004067eb <+395>: pop %rbp | |
0x00000000004067ec <+396>: pop %r12 | |
0x00000000004067ee <+398>: retq | |
67 | |
68 // ---------------------------------------------------------------------- | |
69 /* | |
70 prefetch heuristics: | |
71 _mm_prefetch is a hint to fill the cache | |
72 | |
73 */ | |
74 | |
75 void add_force(float x, float y) { | |
76 __m128 force_x = _mm_set_ps(x, x, x, x); | |
77 __m128 force_y = _mm_set_ps(y, y, y, y); | |
78 | |
79 int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0])); | |
80 __m128 tmp_x; | |
81 __m128 tmp_y; | |
82 for(int i = 0; i < GROUP_COUNT; ++i){ | |
0x0000000000406707 <+167>: sub $0x1,%ecx | |
0x000000000040670a <+170>: jne 0x4066f0 <main()+144> | |
0x000000000040670c <+172>: mov %rdi,%rcx | |
0x000000000040670f <+175>: xor %edx,%edx | |
0x0000000000406711 <+177>: jmp 0x406727 <main()+199> | |
0x0000000000406713 <+179>: nopl 0x0(%rax,%rax,1) | |
83 | |
84 if(i % 256 == 0) { | |
85 // _mm_prefetch(particles + i + 1, _MM_HINT_T0); | |
86 // _mm_prefetch(particles + i + prefetch_count, _MM_HINT_T0); | |
87 } | |
88 tmp_x = particles[i].force_x; | |
89 tmp_y = particles[i].force_y; | |
90 | |
91 particles[i].force_x = _mm_add_ps(tmp_x, force_x); | |
0x00000000004066f8 <+152>: vmovaps %xmm0,-0x60(%rdx) | |
92 particles[i].force_y = _mm_add_ps(tmp_y, force_y); | |
0x0000000000406702 <+162>: vmovaps %xmm0,-0x50(%rdx) | |
93 | |
94 } | |
95 } | |
96 | |
97 void step() { | |
98 __m128 const null_force = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); | |
99 __m128 const drag = _mm_set_ps(0.99f, 0.99f, 0.99f, 0.99f); | |
100 __m128 pos_x; | |
101 __m128 pos_y; | |
102 __m128 vel_x; | |
103 __m128 vel_y; | |
104 int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0])); | |
105 uint32_t i = 0; | |
106 uint32_t total = GROUP_COUNT; | |
107 for (i = 0; i < total; ++i){ | |
0x0000000000406718 <+184>: add $0x1,%edx | |
0x000000000040671b <+187>: add $0x60,%rcx | |
0x000000000040671f <+191>: cmp $0x61a8,%edx | |
0x0000000000406725 <+197>: je 0x40673e <main()+222> | |
0x000000000040672c <+204>: add $0x1,%edx | |
0x0000000000406736 <+214>: cmp $0x61a8,%edx | |
0x000000000040673c <+220>: jne 0x406727 <main()+199> | |
108 | |
109 if( (i % 2) == 0) { | |
0x0000000000406727 <+199>: test $0x1,%dl | |
0x000000000040672a <+202>: jne 0x406718 <main()+184> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment