Skip to content

Instantly share code, notes, and snippets.

@roxlu
Last active December 27, 2015 00:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save roxlu/3be5b919b495094a402f to your computer and use it in GitHub Desktop.
Save roxlu/3be5b919b495094a402f to your computer and use it in GitHub Desktop.
Compiled with "-ggdb", run under gdb, interrupted with Ctrl+C, then typed: disas /m
39 int main() {
0x0000000000406660 <+0>: push %r12
0x000000000040666c <+12>: push %rbp
0x0000000000406675 <+21>: push %rbx
0x000000000040667b <+27>: sub $0x20,%rsp
40 posix_memalign((void**)&particles, 256, sizeof(particle_group) * GROUP_COUNT);
0x0000000000406662 <+2>: mov $0x249f00,%edx
0x0000000000406667 <+7>: mov $0x100,%esi
0x000000000040666d <+13>: mov $0x6238e0,%edi
0x0000000000406676 <+22>: mov $0xa,%ebx
0x000000000040667f <+31>: callq 0x405de0 <posix_memalign@plt>
0x0000000000406684 <+36>: vmovaps 0x15ea4(%rip),%xmm3 # 0x41c530
0x000000000040668c <+44>: vmovaps 0x15eac(%rip),%xmm2 # 0x41c540
41
42 uint64_t total_time = 0;
0x0000000000406672 <+18>: xor %r12d,%r12d
0x00000000004066e3 <+131>: mov $0x61a8,%ecx
0x00000000004066e8 <+136>: nopl 0x0(%rax,%rax,1)
43
44 for(int k = 0; k < NTESTS; ++k) {
0x000000000040679f <+319>: sub $0x1,%ebx
0x00000000004067a2 <+322>: vmovaps (%rsp),%xmm3
0x00000000004067a7 <+327>: vmovaps 0x10(%rsp),%xmm2
0x00000000004067ad <+333>: jne 0x406694 <main()+52>
45
46 memset((char*)particles, 0x00, sizeof(particle_group) * GROUP_COUNT);
0x0000000000406694 <+52>: mov 0x21d245(%rip),%rdi # 0x6238e0 <particles>
0x000000000040669b <+59>: mov $0x249f00,%edx
0x00000000004066a0 <+64>: xor %esi,%esi
0x00000000004066a2 <+66>: vmovaps %xmm2,0x10(%rsp)
0x00000000004066a8 <+72>: vmovaps %xmm3,(%rsp)
0x00000000004066ad <+77>: callq 0x405da0 <memset@plt>
47
48 uint64_t start = uv_hrtime();
0x00000000004066b2 <+82>: callq 0x406abd <uv_hrtime>
0x00000000004066b7 <+87>: mov 0x21d222(%rip),%rdx # 0x6238e0 <particles>
0x00000000004066be <+94>: vmovaps (%rsp),%xmm3
0x00000000004066c3 <+99>: mov %rax,%rbp
0x00000000004066c6 <+102>: mov $0xbb8,%eax
0x00000000004066cb <+107>: vmovaps 0x10(%rsp),%xmm2
0x00000000004066d1 <+113>: lea 0x60(%rdx),%rdi
0x00000000004066d5 <+117>: lea 0x20(%rdx),%rsi
0x00000000004066d9 <+121>: nopl 0x0(%rax)
0x00000000004066e0 <+128>: mov %rsi,%rdx
49 for(int i = 0; i < NLOOPS; ++i) {
0x000000000040673e <+222>: sub $0x1,%eax
0x0000000000406741 <+225>: jne 0x4066e0 <main()+128>
0x0000000000406743 <+227>: vmovaps %xmm2,0x10(%rsp)
0x0000000000406749 <+233>: vmovaps %xmm3,(%rsp)
50 add_force(0.0005, 0.03);
51 step();
52 }
53 uint64_t d = uv_hrtime() - start;
0x000000000040674e <+238>: callq 0x406abd <uv_hrtime>
54 printf("Took: %lld, millis: %f, millis per loop: %f\n", d, double(d)/1000000.0, (double(d)/1000000.0)/float(NLOOPS));
0x0000000000406753 <+243>: vmovaps (%rsp),%xmm3
0x0000000000406758 <+248>: sub %rbp,%rax
0x000000000040675b <+251>: vmovaps 0x10(%rsp),%xmm2
0x0000000000406761 <+257>: mov %rax,%rbp
0x0000000000406764 <+260>: js 0x4067ef <main()+399>
0x000000000040676a <+266>: vcvtsi2sd %rax,%xmm0,%xmm0
0x000000000040676f <+271>: mov %rbp,%rsi
0x0000000000406772 <+274>: mov $0x41c4e8,%edi
0x0000000000406777 <+279>: mov $0x2,%eax
0x000000000040677c <+284>: vmovaps %xmm3,(%rsp)
0x0000000000406781 <+289>: vdivsd 0x15dc7(%rip),%xmm0,%xmm0 # 0x41c550
0x0000000000406792 <+306>: vdivsd 0x15dbe(%rip),%xmm0,%xmm1 # 0x41c558
0x000000000040679a <+314>: callq 0x405d80 <printf@plt>
0x00000000004067ef <+399>: mov %rbp,%rdx
0x00000000004067f2 <+402>: shr %rax
0x00000000004067f5 <+405>: and $0x1,%edx
0x00000000004067f8 <+408>: or %rdx,%rax
0x00000000004067fb <+411>: vcvtsi2sd %rax,%xmm0,%xmm0
0x0000000000406800 <+416>: vaddsd %xmm0,%xmm0,%xmm0
0x0000000000406804 <+420>: jmpq 0x40676f <main()+271>
55
56 total_time += d;
0x0000000000406789 <+297>: add %rbp,%r12
0x000000000040678c <+300>: vmovaps %xmm2,0x10(%rsp)
57 }
58
59 for(int i = 0; i < 10; ++i) {
60 int dx = i * 2;
61 // printf("force_x: %f, force_y: %f, pos_x: %f, pos_y: %f\n", forces[dx + 0], forces[dx + 1], positions[dx + 0], positions[dx + 1]);
62 }
63
64 double avg = (double(total_time)/1000000.0)/double(NTESTS);
0x00000000004067b3 <+339>: test %r12,%r12
0x00000000004067b6 <+342>: js 0x406809 <main()+425>
0x00000000004067b8 <+344>: vcvtsi2sd %r12,%xmm0,%xmm0
0x00000000004067c7 <+359>: vdivsd 0x15d81(%rip),%xmm0,%xmm0 # 0x41c550
0x00000000004067cf <+367>: vdivsd 0x15d89(%rip),%xmm0,%xmm0 # 0x41c560
0x0000000000406809 <+425>: mov %r12,%rax
0x000000000040680c <+428>: and $0x1,%r12d
0x0000000000406810 <+432>: shr %rax
0x0000000000406813 <+435>: or %r12,%rax
0x0000000000406816 <+438>: vcvtsi2sd %rax,%xmm0,%xmm0
0x000000000040681b <+443>: vaddsd %xmm0,%xmm0,%xmm0
0x000000000040681f <+447>: jmp 0x4067bd <main()+349>
0x0000000000406821: data32 data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
65 printf("Avarage: %f, ms: %f\n", avg, avg/float(NLOOPS));
0x00000000004067bd <+349>: mov $0x41c518,%edi
0x00000000004067c2 <+354>: mov $0x2,%eax
0x00000000004067d7 <+375>: vdivsd 0x15d79(%rip),%xmm0,%xmm1 # 0x41c558
0x00000000004067df <+383>: callq 0x405d80 <printf@plt>
66 }
0x00000000004067e4 <+388>: add $0x20,%rsp
0x00000000004067e8 <+392>: xor %eax,%eax
0x00000000004067ea <+394>: pop %rbx
0x00000000004067eb <+395>: pop %rbp
0x00000000004067ec <+396>: pop %r12
0x00000000004067ee <+398>: retq
67
68 // ----------------------------------------------------------------------
69 /*
70 prefetch heuristics:
71 _mm_prefetch is a hint to fill the cache
72
73 */
74
75 void add_force(float x, float y) {
76 __m128 force_x = _mm_set_ps(x, x, x, x);
77 __m128 force_y = _mm_set_ps(y, y, y, y);
78
79 int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0]));
80 __m128 tmp_x;
81 __m128 tmp_y;
82 for(int i = 0; i < GROUP_COUNT; ++i){
0x0000000000406707 <+167>: sub $0x1,%ecx
0x000000000040670a <+170>: jne 0x4066f0 <main()+144>
0x000000000040670c <+172>: mov %rdi,%rcx
0x000000000040670f <+175>: xor %edx,%edx
0x0000000000406711 <+177>: jmp 0x406727 <main()+199>
0x0000000000406713 <+179>: nopl 0x0(%rax,%rax,1)
83
84 if(i % 256 == 0) {
85 // _mm_prefetch(particles + i + 1, _MM_HINT_T0);
86 // _mm_prefetch(particles + i + prefetch_count, _MM_HINT_T0);
87 }
88 tmp_x = particles[i].force_x;
89 tmp_y = particles[i].force_y;
90
91 particles[i].force_x = _mm_add_ps(tmp_x, force_x);
0x00000000004066f8 <+152>: vmovaps %xmm0,-0x60(%rdx)
92 particles[i].force_y = _mm_add_ps(tmp_y, force_y);
0x0000000000406702 <+162>: vmovaps %xmm0,-0x50(%rdx)
93
94 }
95 }
96
97 void step() {
98 __m128 const null_force = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
99 __m128 const drag = _mm_set_ps(0.99f, 0.99f, 0.99f, 0.99f);
100 __m128 pos_x;
101 __m128 pos_y;
102 __m128 vel_x;
103 __m128 vel_y;
104 int const prefetch_count = power_of_two_below(CACHE_LINE_SIZE / sizeof(particles[0]));
105 uint32_t i = 0;
106 uint32_t total = GROUP_COUNT;
107 for (i = 0; i < total; ++i){
0x0000000000406718 <+184>: add $0x1,%edx
0x000000000040671b <+187>: add $0x60,%rcx
0x000000000040671f <+191>: cmp $0x61a8,%edx
0x0000000000406725 <+197>: je 0x40673e <main()+222>
0x000000000040672c <+204>: add $0x1,%edx
0x0000000000406736 <+214>: cmp $0x61a8,%edx
0x000000000040673c <+220>: jne 0x406727 <main()+199>
108
109 if( (i % 2) == 0) {
0x0000000000406727 <+199>: test $0x1,%dl
0x000000000040672a <+202>: jne 0x406718 <main()+184>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment