Skip to content

Instantly share code, notes, and snippets.

@ChillFish8
Created July 3, 2024 08:47
Show Gist options
  • Save ChillFish8/c452d86e4953fe502276d95102d12582 to your computer and use it in GitHub Desktop.
Save ChillFish8/c452d86e4953fe502276d95102d12582 to your computer and use it in GitHub Desktop.
Iterations: 100
Instructions: 12600
Total Cycles: 10225
Total uOps: 22600
Dispatch Width: 6
uOps Per Cycle: 2.21
IPC: 1.23
Block RThroughput: 37.7
Cycles with backend pressure increase [ 16.59% ]
Throughput Bottlenecks:
Resource Pressure [ 10.69% ]
- Zn3ALU0 [ 0.96% ]
- Zn3ALU1 [ 0.96% ]
- Zn3ALU2 [ 0.96% ]
- Zn3ALU3 [ 0.96% ]
- Zn3FPP0 [ 0.99% ]
- Zn3FPP1 [ 0.99% ]
- Zn3FPP45 [ 9.73% ]
Data Dependencies: [ 15.62% ]
- Register Dependencies [ 15.62% ]
- Memory Dependencies [ 0.00% ]
Critical sequence based on the simulation:
Instruction Dependency Information
+----< 125. int3
|
| < loop carried >
|
| 0. sub rsp, 104
| 1. mov qword ptr [rsp + 40], rdx
| 2. mov qword ptr [rsp + 48], r9
| 3. cmp rdx, r9
| 4. jne .LBB4_16
| 5. mov rax, rdx
| 6. and rax, -8
| 7. je .LBB4_2
| 8. lea r10, [rax - 1]
| 9. shr r10, 3
| 10. inc r10
| 11. mov r9d, r10d
| 12. and r9d, 7
| 13. cmp rax, 57
| 14. jae .LBB4_14
| 15. vxorps xmm0, xmm0, xmm0
| 16. xor eax, eax
| 17. jmp .LBB4_5
| 18. xor eax, eax
| 19. vxorps xmm0, xmm0, xmm0
| 20. jmp .LBB4_7
| 21. and r10, -8
| 22. vxorps xmm0, xmm0, xmm0
| 23. xor eax, eax
| 24. vmovups ymm1, ymmword ptr [rcx + 4*rax]
| 25. vmovups ymm2, ymmword ptr [rcx + 4*rax + 32]
| 26. vmovups ymm3, ymmword ptr [rcx + 4*rax + 64]
| 27. vmovups ymm4, ymmword ptr [rcx + 4*rax + 96]
| 28. vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
| 29. vaddps ymm0, ymm0, ymm1
| 30. vmulps ymm1, ymm2, ymmword ptr [r8 + 4*rax + 32]
| 31. vaddps ymm0, ymm0, ymm1
| 32. vmulps ymm1, ymm3, ymmword ptr [r8 + 4*rax + 64]
| 33. vaddps ymm0, ymm0, ymm1
| 34. vmulps ymm1, ymm4, ymmword ptr [r8 + 4*rax + 96]
| 35. vaddps ymm0, ymm0, ymm1
| 36. vmovups ymm1, ymmword ptr [rcx + 4*rax + 128]
| 37. vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 128]
| 38. vaddps ymm0, ymm0, ymm1
| 39. vmovups ymm1, ymmword ptr [rcx + 4*rax + 160]
| 40. vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 160]
| 41. vaddps ymm0, ymm0, ymm1
| 42. vmovups ymm1, ymmword ptr [rcx + 4*rax + 192]
| 43. vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 192]
| 44. vaddps ymm0, ymm0, ymm1
| 45. vmovups ymm1, ymmword ptr [rcx + 4*rax + 224]
| 46. vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 224]
| 47. add rax, 64
| 48. add r10, -8
| 49. vaddps ymm0, ymm0, ymm1
| 50. jne .LBB4_15
| 51. test r9, r9
| 52. je .LBB4_7
| 53. vmovups ymm1, ymmword ptr [rcx + 4*rax]
| 54. vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
| 55. add rax, 8
| 56. dec r9
| 57. vaddps ymm0, ymm0, ymm1
| 58. jne .LBB4_6
| 59. vextractf128 xmm1, ymm0, 1
| 60. mov r9, rax
| 61. sub r9, rdx
| 62. vaddps xmm0, xmm1, xmm0
| 63. vshufpd xmm1, xmm0, xmm0, 1
| 64. vaddps xmm0, xmm0, xmm1
| 65. vmovshdup xmm1, xmm0
| 66. vaddss xmm0, xmm0, xmm1
| 67. jae .LBB4_13
| 68. mov r10d, edx
| 69. sub r10d, eax
| 70. and r10d, 7
| 71. je .LBB4_10
| 72. vmovss xmm1, dword ptr [rcx + 4*rax]
| 73. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
| 74. inc rax
| 75. dec r10
| 76. vaddss xmm0, xmm0, xmm1
| 77. jne .LBB4_9
| 78. cmp r9, -8
| 79. ja .LBB4_13
+----> 80. neg rdx ## RESOURCE interference: Zn3ALU1 [ probability: 99% ]
81. add rax, 7
82. vmovss xmm1, dword ptr [rcx + 4*rax - 28]
83. vmovss xmm2, dword ptr [rcx + 4*rax - 24]
84. lea r9, [rdx + rax + 8]
85. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 28]
86. vaddss xmm0, xmm0, xmm1
87. vmulss xmm1, xmm2, dword ptr [r8 + 4*rax - 24]
88. vaddss xmm0, xmm0, xmm1
89. vmovss xmm1, dword ptr [rcx + 4*rax - 20]
90. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 20]
91. vaddss xmm0, xmm0, xmm1
92. vmovss xmm1, dword ptr [rcx + 4*rax - 16]
93. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 16]
94. vaddss xmm0, xmm0, xmm1
95. vmovss xmm1, dword ptr [rcx + 4*rax - 12]
96. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 12]
97. vaddss xmm0, xmm0, xmm1
98. vmovss xmm1, dword ptr [rcx + 4*rax - 8]
99. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 8]
100. vaddss xmm0, xmm0, xmm1
101. vmovss xmm1, dword ptr [rcx + 4*rax - 4]
102. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 4]
103. vaddss xmm0, xmm0, xmm1
104. vmovss xmm1, dword ptr [rcx + 4*rax]
105. vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
106. add rax, 8
107. vaddss xmm0, xmm0, xmm1
108. cmp r9, 7
109. jne .LBB4_12
110. add rsp, 104
111. vzeroupper
112. ret
113. lea rax, [rip + __unnamed_2]
114. lea rcx, [rip + __unnamed_3]
115. lea r9, [rip + __unnamed_5]
116. lea rdx, [rsp + 48]
117. lea r8, [rsp + 56]
118. vxorps xmm0, xmm0, xmm0
119. mov qword ptr [rsp + 56], rax
120. mov qword ptr [rsp + 64], 1
121. mov qword ptr [rsp + 72], rcx
122. lea rcx, [rsp + 40]
123. vmovups xmmword ptr [rsp + 80], xmm0
124. call core::panicking::assert_failed
125. int3
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 sub rsp, 104
1 1 1.00 * mov qword ptr [rsp + 40], rdx
1 1 1.00 * mov qword ptr [rsp + 48], r9
1 1 0.25 cmp rdx, r9
1 1 0.50 jne .LBB4_16
1 0 0.17 mov rax, rdx
1 1 0.25 and rax, -8
1 1 0.50 je .LBB4_2
1 1 0.33 lea r10, [rax - 1]
1 1 0.50 shr r10, 3
1 1 0.25 inc r10
1 0 0.17 mov r9d, r10d
1 1 0.25 and r9d, 7
1 1 0.25 cmp rax, 57
1 1 0.50 jae .LBB4_14
1 0 0.17 vxorps xmm0, xmm0, xmm0
1 0 0.17 xor eax, eax
1 1 0.50 jmp .LBB4_5
1 0 0.17 xor eax, eax
1 0 0.17 vxorps xmm0, xmm0, xmm0
1 1 0.50 jmp .LBB4_7
1 1 0.25 and r10, -8
1 0 0.17 vxorps xmm0, xmm0, xmm0
1 0 0.17 xor eax, eax
1 8 0.50 * vmovups ymm1, ymmword ptr [rcx + 4*rax]
1 8 0.50 * vmovups ymm2, ymmword ptr [rcx + 4*rax + 32]
1 8 0.50 * vmovups ymm3, ymmword ptr [rcx + 4*rax + 64]
1 8 0.50 * vmovups ymm4, ymmword ptr [rcx + 4*rax + 96]
1 10 0.50 * vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 10 0.50 * vmulps ymm1, ymm2, ymmword ptr [r8 + 4*rax + 32]
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 10 0.50 * vmulps ymm1, ymm3, ymmword ptr [r8 + 4*rax + 64]
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 10 0.50 * vmulps ymm1, ymm4, ymmword ptr [r8 + 4*rax + 96]
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 8 0.50 * vmovups ymm1, ymmword ptr [rcx + 4*rax + 128]
1 10 0.50 * vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 128]
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 8 0.50 * vmovups ymm1, ymmword ptr [rcx + 4*rax + 160]
1 10 0.50 * vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 160]
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 8 0.50 * vmovups ymm1, ymmword ptr [rcx + 4*rax + 192]
1 10 0.50 * vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 192]
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 8 0.50 * vmovups ymm1, ymmword ptr [rcx + 4*rax + 224]
1 10 0.50 * vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 224]
1 1 0.25 add rax, 64
1 1 0.25 add r10, -8
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 1 0.50 jne .LBB4_15
1 1 0.25 test r9, r9
1 1 0.50 je .LBB4_7
1 8 0.50 * vmovups ymm1, ymmword ptr [rcx + 4*rax]
1 10 0.50 * vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
1 1 0.25 add rax, 8
1 1 0.25 dec r9
1 3 0.50 vaddps ymm0, ymm0, ymm1
1 1 0.50 jne .LBB4_6
1 4 1.00 vextractf128 xmm1, ymm0, 1
1 0 0.17 mov r9, rax
1 1 0.25 sub r9, rdx
1 3 0.50 vaddps xmm0, xmm1, xmm0
1 1 0.50 vshufpd xmm1, xmm0, xmm0, 1
1 3 0.50 vaddps xmm0, xmm0, xmm1
1 1 0.50 vmovshdup xmm1, xmm0
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 1 0.50 jae .LBB4_13
1 0 0.17 mov r10d, edx
1 1 0.25 sub r10d, eax
1 1 0.25 and r10d, 7
1 1 0.50 je .LBB4_10
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
1 1 0.25 inc rax
1 1 0.25 dec r10
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 1 0.50 jne .LBB4_9
1 1 0.25 cmp r9, -8
1 1 0.50 ja .LBB4_13
1 1 0.25 neg rdx
1 1 0.25 add rax, 7
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax - 28]
1 8 0.50 * vmovss xmm2, dword ptr [rcx + 4*rax - 24]
2 2 0.25 lea r9, [rdx + rax + 8]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 28]
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 10 0.50 * vmulss xmm1, xmm2, dword ptr [r8 + 4*rax - 24]
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax - 20]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 20]
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax - 16]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 16]
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax - 12]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 12]
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax - 8]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 8]
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax - 4]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 4]
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 8 0.50 * vmovss xmm1, dword ptr [rcx + 4*rax]
1 10 0.50 * vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
1 1 0.25 add rax, 8
1 3 0.50 vaddss xmm0, xmm0, xmm1
1 1 0.25 cmp r9, 7
1 1 0.50 jne .LBB4_12
1 1 0.25 add rsp, 104
1 0 0.25 U vzeroupper
1 5 0.50 U ret
1 1 0.33 lea rax, [rip + __unnamed_2]
1 1 0.33 lea rcx, [rip + __unnamed_3]
1 1 0.33 lea r9, [rip + __unnamed_5]
1 1 0.33 lea rdx, [rsp + 48]
1 1 0.33 lea r8, [rsp + 56]
1 0 0.17 vxorps xmm0, xmm0, xmm0
1 1 1.00 * mov qword ptr [rsp + 56], rax
1 1 1.00 * mov qword ptr [rsp + 64], 1
1 1 1.00 * mov qword ptr [rsp + 72], rcx
1 1 0.33 lea rcx, [rsp + 40]
1 1 1.00 * vmovups xmmword ptr [rsp + 80], xmm0
1 1 0.50 call core::panicking::assert_failed
100 100 25.00 * * U int3
Resources:
[0] - Zn3AGU0
[1] - Zn3AGU1
[2] - Zn3AGU2
[3] - Zn3ALU0
[4] - Zn3ALU1
[5] - Zn3ALU2
[6] - Zn3ALU3
[7] - Zn3BRU1
[8] - Zn3FPP0
[9] - Zn3FPP1
[10] - Zn3FPP2
[11] - Zn3FPP3
[12.0] - Zn3FPP45
[12.1] - Zn3FPP45
[13] - Zn3FPSt
[14.0] - Zn3LSU
[14.1] - Zn3LSU
[14.2] - Zn3LSU
[15.0] - Zn3Load
[15.1] - Zn3Load
[15.2] - Zn3Load
[16.0] - Zn3Store
[16.1] - Zn3Store
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
4.33 4.33 4.34 10.97 100.07 8.99 9.98 9.99 10.00 10.99 11.00 11.01 18.50 18.50 1.00 15.99 16.00 16.01 12.33 12.33 12.34 5.00 6.00
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
- - - - - - 1.00 - - - - - - - - - - - - - - - - sub rsp, 104
0.33 0.33 0.34 - - - - - - - - - - - - 0.66 0.66 0.68 - - - - 2.00 mov qword ptr [rsp + 40], rdx
0.33 0.34 0.33 - - - - - - - - - - - - 0.66 0.68 0.66 - - - 2.00 - mov qword ptr [rsp + 48], r9
- - - - - 1.00 - - - - - - - - - - - - - - - - - cmp rdx, r9
- - - - - - - 1.00 - - - - - - - - - - - - - - - jne .LBB4_16
- - - - - - - - - - - - - - - - - - - - - - - mov rax, rdx
- - - 0.99 0.01 - - - - - - - - - - - - - - - - - - and rax, -8
- - - 1.00 - - - - - - - - - - - - - - - - - - - je .LBB4_2
0.34 0.33 0.33 - - - - - - - - - - - - - - - - - - - - lea r10, [rax - 1]
- - - - - 1.00 - - - - - - - - - - - - - - - - - shr r10, 3
- - - - 0.01 - 0.99 - - - - - - - - - - - - - - - - inc r10
- - - - - - - - - - - - - - - - - - - - - - - mov r9d, r10d
- - - - - 0.99 0.01 - - - - - - - - - - - - - - - - and r9d, 7
- - - - - - 1.00 - - - - - - - - - - - - - - - - cmp rax, 57
- - - 0.01 - - - 0.99 - - - - - - - - - - - - - - - jae .LBB4_14
- - - - - - - - - - - - - - - - - - - - - - - vxorps xmm0, xmm0, xmm0
- - - - - - - - - - - - - - - - - - - - - - - xor eax, eax
- - - 0.99 - - - 0.01 - - - - - - - - - - - - - - - jmp .LBB4_5
- - - - - - - - - - - - - - - - - - - - - - - xor eax, eax
- - - - - - - - - - - - - - - - - - - - - - - vxorps xmm0, xmm0, xmm0
- - - - - - - 1.00 - - - - - - - - - - - - - - - jmp .LBB4_7
- - - 0.99 - 0.01 - - - - - - - - - - - - - - - - - and r10, -8
- - - - - - - - - - - - - - - - - - - - - - - vxorps xmm0, xmm0, xmm0
- - - - - - - - - - - - - - - - - - - - - - - xor eax, eax
- - - - - - - - - - - - 0.50 0.50 - 0.34 0.33 0.33 0.33 0.33 0.34 - - vmovups ymm1, ymmword ptr [rcx + 4*rax]
- - - - - - - - - - - - 0.50 0.50 - 0.33 0.33 0.34 0.33 0.34 0.33 - - vmovups ymm2, ymmword ptr [rcx + 4*rax + 32]
- - - - - - - - - - - - 0.50 0.50 - 0.33 0.34 0.33 0.34 0.33 0.33 - - vmovups ymm3, ymmword ptr [rcx + 4*rax + 64]
- - - - - - - - - - - - 0.50 0.50 - 0.34 0.33 0.33 0.33 0.33 0.34 - - vmovups ymm4, ymmword ptr [rcx + 4*rax + 96]
- - - - - - - - 0.98 0.02 - - 0.50 0.50 - 0.33 0.33 0.34 0.33 0.34 0.33 - - vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
- - - - - - - - - - 0.99 0.01 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - - 0.01 0.99 - - 0.51 0.49 - 0.32 0.34 0.34 0.34 0.34 0.32 - - vmulps ymm1, ymm2, ymmword ptr [r8 + 4*rax + 32]
- - - - - - - - - - - 1.00 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - - 0.99 0.01 - - 0.49 0.51 - 0.34 0.34 0.32 0.34 0.32 0.34 - - vmulps ymm1, ymm3, ymmword ptr [r8 + 4*rax + 64]
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - - 0.01 0.99 - - 0.51 0.49 - 0.34 0.32 0.34 0.32 0.34 0.34 - - vmulps ymm1, ymm4, ymmword ptr [r8 + 4*rax + 96]
- - - - - - - - - - 0.98 0.02 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - - - - - - 0.49 0.51 - 0.33 0.34 0.33 0.34 0.33 0.33 - - vmovups ymm1, ymmword ptr [rcx + 4*rax + 128]
- - - - - - - - 0.99 0.01 - - 0.49 0.51 - 0.33 0.32 0.35 0.32 0.35 0.33 - - vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 128]
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - - - - - - 0.51 0.49 - 0.34 0.34 0.32 0.34 0.32 0.34 - - vmovups ymm1, ymmword ptr [rcx + 4*rax + 160]
- - - - - - - - 0.01 0.99 - - 0.51 0.49 - 0.32 0.34 0.34 0.34 0.34 0.32 - - vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 160]
- - - - - - - - - - 0.98 0.02 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - - - - - - 0.49 0.51 - 0.35 0.33 0.32 0.33 0.32 0.35 - - vmovups ymm1, ymmword ptr [rcx + 4*rax + 192]
- - - - - - - - 0.99 0.01 - - 0.49 0.51 - 0.32 0.35 0.33 0.35 0.33 0.32 - - vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 192]
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - - - - - - 0.51 0.49 - 0.34 0.32 0.34 0.32 0.34 0.34 - - vmovups ymm1, ymmword ptr [rcx + 4*rax + 224]
- - - - - - - - 0.01 0.99 - - 0.51 0.49 - 0.34 0.34 0.32 0.34 0.32 0.34 - - vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax + 224]
- - - - 0.01 - 0.99 - - - - - - - - - - - - - - - - add rax, 64
- - - 0.01 - 0.99 - - - - - - - - - - - - - - - - - add r10, -8
- - - - - - - - - - 0.98 0.02 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - 1.00 - - - - - - - - - - - - - - - jne .LBB4_15
- - - 0.99 - - 0.01 - - - - - - - - - - - - - - - - test r9, r9
- - - 0.01 - - - 0.99 - - - - - - - - - - - - - - - je .LBB4_7
- - - - - - - - - - - - 0.49 0.51 - 0.33 0.32 0.35 0.32 0.35 0.33 - - vmovups ymm1, ymmword ptr [rcx + 4*rax]
- - - - - - - - 0.99 0.01 - - 0.49 0.51 - 0.35 0.33 0.32 0.33 0.32 0.35 - - vmulps ymm1, ymm1, ymmword ptr [r8 + 4*rax]
- - - - - 0.01 0.99 - - - - - - - - - - - - - - - - add rax, 8
- - - - 0.01 0.99 - - - - - - - - - - - - - - - - - dec r9
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddps ymm0, ymm0, ymm1
- - - - - - - 1.00 - - - - - - - - - - - - - - - jne .LBB4_6
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextractf128 xmm1, ymm0, 1
- - - - - - - - - - - - - - - - - - - - - - - mov r9, rax
- - - 0.99 - - 0.01 - - - - - - - - - - - - - - - - sub r9, rdx
- - - - - - - - - - 0.99 0.01 - - - - - - - - - - - vaddps xmm0, xmm1, xmm0
- - - - - - - - - 0.99 0.01 - - - - - - - - - - - - vshufpd xmm1, xmm0, xmm0, 1
- - - - - - - - - - - 1.00 - - - - - - - - - - - vaddps xmm0, xmm0, xmm1
- - - - - - - - - 0.99 0.01 - - - - - - - - - - - - vmovshdup xmm1, xmm0
- - - - - - - - - - 0.99 0.01 - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - 1.00 - - - - - - - - - - - - - - - - - - - jae .LBB4_13
- - - - - - - - - - - - - - - - - - - - - - - mov r10d, edx
- - - - - 0.01 0.99 - - - - - - - - - - - - - - - - sub r10d, eax
- - - - 0.01 0.99 - - - - - - - - - - - - - - - - - and r10d, 7
- - - - - - - 1.00 - - - - - - - - - - - - - - - je .LBB4_10
- - - - - - - - - - - - 0.51 0.49 - 0.32 0.34 0.34 0.34 0.34 0.32 - - vmovss xmm1, dword ptr [rcx + 4*rax]
- - - - - - - - 0.01 0.99 - - 0.50 0.50 - 0.33 0.33 0.34 0.33 0.34 0.33 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
- - - - - - 1.00 - - - - - - - - - - - - - - - - inc rax
- - - - - 0.01 0.99 - - - - - - - - - - - - - - - - dec r10
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - 1.00 - - - - - - - - - - - - - - - jne .LBB4_9
- - - 0.99 0.01 - - - - - - - - - - - - - - - - - - cmp r9, -8
- - - 1.00 - - - - - - - - - - - - - - - - - - - ja .LBB4_13
- - - - - - 1.00 - - - - - - - - - - - - - - - - neg rdx
- - - 0.01 - 0.99 - - - - - - - - - - - - - - - - - add rax, 7
- - - - - - - - - - - - 0.50 0.50 - 0.33 0.34 0.33 0.34 0.33 0.33 - - vmovss xmm1, dword ptr [rcx + 4*rax - 28]
- - - - - - - - - - - - 0.50 0.50 - 0.34 0.33 0.33 0.33 0.33 0.34 - - vmovss xmm2, dword ptr [rcx + 4*rax - 24]
- - - - - 1.00 - - - - - - - - - - - - - - - - - lea r9, [rdx + rax + 8]
- - - - - - - - 0.99 0.01 - - 0.50 0.50 - 0.33 0.33 0.34 0.33 0.34 0.33 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 28]
- - - - - - - - - - 1.00 - - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - - 0.01 0.99 - - 0.50 0.50 - 0.33 0.34 0.33 0.34 0.33 0.33 - - vmulss xmm1, xmm2, dword ptr [r8 + 4*rax - 24]
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.34 0.33 0.33 0.33 0.33 0.34 - - vmovss xmm1, dword ptr [rcx + 4*rax - 20]
- - - - - - - - 0.99 0.01 - - 0.50 0.50 - 0.33 0.34 0.33 0.34 0.33 0.33 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 20]
- - - - - - - - - - 1.00 - - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.33 0.33 0.34 0.33 0.34 0.33 - - vmovss xmm1, dword ptr [rcx + 4*rax - 16]
- - - - - - - - 0.01 0.99 - - 0.50 0.50 - 0.34 0.33 0.33 0.33 0.33 0.34 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 16]
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.33 0.33 0.34 0.33 0.34 0.33 - - vmovss xmm1, dword ptr [rcx + 4*rax - 12]
- - - - - - - - 0.99 0.01 - - 0.50 0.50 - 0.33 0.32 0.35 0.32 0.35 0.33 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 12]
- - - - - - - - - - 1.00 - - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.33 0.34 0.33 0.34 0.33 0.33 - - vmovss xmm1, dword ptr [rcx + 4*rax - 8]
- - - - - - - - 0.01 0.99 - - 0.50 0.50 - 0.32 0.35 0.33 0.35 0.33 0.32 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 8]
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.35 0.33 0.32 0.33 0.32 0.35 - - vmovss xmm1, dword ptr [rcx + 4*rax - 4]
- - - - - - - - 0.99 0.01 - - 0.50 0.50 - 0.32 0.35 0.33 0.35 0.33 0.32 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax - 4]
- - - - - - - - - - 1.00 - - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.33 0.32 0.35 0.32 0.35 0.33 - - vmovss xmm1, dword ptr [rcx + 4*rax]
- - - - - - - - 0.01 0.99 - - 0.50 0.50 - 0.35 0.33 0.32 0.33 0.32 0.35 - - vmulss xmm1, xmm1, dword ptr [r8 + 4*rax]
- - - 0.99 0.01 - - - - - - - - - - - - - - - - - - add rax, 8
- - - - - - - - - - - 1.00 - - - - - - - - - - - vaddss xmm0, xmm0, xmm1
- - - - - - 1.00 - - - - - - - - - - - - - - - - cmp r9, 7
- - - 0.01 - - - 0.99 - - - - - - - - - - - - - - - jne .LBB4_12
- - - - - 1.00 - - - - - - - - - - - - - - - - - add rsp, 104
- - - - - - - - 0.01 - 0.99 - - - - - - - - - - - - vzeroupper
0.33 0.33 0.34 0.99 - - - 0.01 - - - - - - - 0.34 0.33 0.33 0.33 0.33 0.34 - - ret
0.33 0.34 0.33 - - - - - - - - - - - - - - - - - - - - lea rax, [rip + __unnamed_2]
0.34 0.33 0.33 - - - - - - - - - - - - - - - - - - - - lea rcx, [rip + __unnamed_3]
0.33 0.33 0.34 - - - - - - - - - - - - - - - - - - - - lea r9, [rip + __unnamed_5]
0.33 0.34 0.33 - - - - - - - - - - - - - - - - - - - - lea rdx, [rsp + 48]
0.34 0.33 0.33 - - - - - - - - - - - - - - - - - - - - lea r8, [rsp + 56]
- - - - - - - - - - - - - - - - - - - - - - - vxorps xmm0, xmm0, xmm0
0.32 0.35 0.33 - - - - - - - - - - - - 0.66 0.66 0.68 - - - - 2.00 mov qword ptr [rsp + 56], rax
0.35 0.33 0.32 - - - - - - - - - - - - 0.66 0.68 0.66 - - - 2.00 - mov qword ptr [rsp + 64], 1
0.33 0.32 0.35 - - - - - - - - - - - - 0.68 0.66 0.66 - - - - 2.00 mov qword ptr [rsp + 72], rcx
0.33 0.33 0.34 - - - - - - - - - - - - - - - - - - - - lea rcx, [rsp + 40]
- - - - - - - - - - - - 0.50 0.50 1.00 0.33 0.33 0.34 - - - 1.00 - vmovups xmmword ptr [rsp + 80], xmm0
- - - - - - - 1.00 - - - - - - - - - - - - - - - call core::panicking::assert_failed
- - - - 100.00 - - - - - - - - - - - - - - - - - - int3
<stdin>:235:1: error: Unfinished frame!
^
<stdin>:235:1: error: Unfinished frame!
^
warning: found a return instruction in the input assembly sequence.
note: program counter updates are ignored.
warning: found a call in the input assembly sequence.
note: call instructions are not correctly modeled. Assume a latency of 100cy.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment