============= LATENCY ==============================================================================
instruction | IPC ( rel[%]), CPI ( rel[%])
------------------------------------------+---------------------------------------------------------
m128 addps | 0.50-0.25 ( 100.0[%]), 2.00-4.00 ( -50.0[%])
m128 aesdec | 0.33-0.14 ( 133.4[%]), 3.00-7.00 ( -57.1[%])
m128 aesdeclast | 0.33-0.14 ( 133.4[%]), 3.00-7.00 ( -57.1[%])
m128 aesenc | 0.33-0.14 ( 133.3[%]), 3.00-7.00 ( -57.1[%])
m128 aesenclast | 0.33-0.14 ( 133.4[%]), 3.00-7.00 ( -57.1[%])
m128 blendps | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
m128 blendvps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 cvtps2dq | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 divpd | 0.08-0.08 ( 0.0[%]), 13.00-13.00 ( -0.0[%])
m128 divps | 0.09-0.09 ( 0.0[%]), 11.00-11.00 ( -0.0[%])
m128 dpps | 0.07-0.07 ( -0.0[%]), 14.00-14.00 ( 0.0[%])
m128 haddps | 0.48-0.48 ( 0.5[%]), 2.07-2.08 ( -0.5[%])
m128 loadps->movq | 0.12-0.12 ( 0.0[%]), 8.00-8.00 ( -0.0[%])
m128 movaps [mem] | 0.12-0.12 ( 0.0[%]), 8.00-8.00 ( -0.0[%])
m128 movdqu [mem+1] | 0.12-0.12 ( 0.0[%]), 8.00-8.00 ( -0.0[%])
m128 movdqu [mem+2MB-1] (cross page) | 0.07-0.07 ( 0.0[%]), 15.00-15.00 ( -0.0[%])
m128 movdqu [mem+63] (cross cache) | 0.07-0.07 ( -6.7[%]), 15.00-14.00 ( 7.1[%])
m128 movq->movq | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 mulps | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 padd | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
m128 pclmulqdq | 0.33-0.17 ( 100.0[%]), 3.00-6.00 ( -50.0[%])
m128 pcmpestri->movq | 0.08-0.08 ( 0.0[%]), 13.00-13.00 ( -0.0[%])
m128 pcmpestrm | 0.09-0.09 ( 0.8[%]), 10.71-10.79 ( -0.8[%])
m128 pcmpistri->movq | 0.08-0.08 ( 0.0[%]), 12.00-12.00 ( -0.0[%])
m128 pcmpistrm | 0.11-0.11 ( 1.3[%]), 8.79-8.91 ( -1.3[%])
m128 phaddd | 0.48-0.48 ( 0.5[%]), 2.07-2.08 ( -0.5[%])
m128 pinsrd | 0.95-0.92 ( 3.2[%]), 1.05-1.09 ( -3.1[%])
m128 pinsrd->pextr | 0.17-0.17 ( -0.0[%]), 6.00-6.00 ( 0.0[%])
m128 pmovmskb->movq | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 pmuldq | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m128 pmullw | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m128 pshufb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 pxor | 5.54-4.50 ( 23.1[%]), 0.18-0.22 ( -18.7[%])
m128 rcpps | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 rsqrtps | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 shufps | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
m128 vfmapd | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 vfmaps | 0.25-0.25 ( 0.1[%]), 4.00-4.00 ( -0.1[%])
m128 xorps | 5.73-4.50 ( 27.4[%]), 0.17-0.22 ( -21.5[%])
m256 gather32(<ld+ins>x8 + perm) | 0.06-0.06 ( 0.6[%]), 16.27-16.36 ( -0.6[%])
m256 gather64(<ld+ins>x4 + perm) | 0.08-0.08 ( 0.2[%]), 12.43-12.45 ( -0.2[%])
m256 movaps [mem] -> movq | 0.11-0.11 ( 0.1[%]), 9.00-9.01 ( -0.1[%])
m256 vaddps | 0.50-0.25 ( 100.0[%]), 2.00-4.00 ( -50.0[%])
m256 vdivpd | 0.08-0.08 ( 0.0[%]), 13.00-13.00 ( -0.0[%])
m256 vdivps | 0.09-0.09 ( 0.1[%]), 11.00-11.01 ( -0.1[%])
m256 vfmapd | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m256 vfmaps | 0.25-0.25 ( -0.0[%]), 4.00-4.00 ( 0.0[%])
m256 vgatherdpd | 0.04-0.05 ( -13.0[%]), 23.00-20.01 ( 14.9[%])
m256 vmovdqu [mem+1] -> movq | 0.11-0.11 ( 0.0[%]), 9.00-9.00 ( -0.0[%])
m256 vmovdqu [mem+2MB-1] (cross page) -> movq | 0.06-0.06 ( 0.0[%]), 16.00-16.01 ( -0.0[%])
m256 vmovdqu [mem+63] (cross cache) -> movq | 0.06-0.07 ( -12.5[%]), 16.00-14.00 ( 14.3[%])
m256 vmulps | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m256 vpaddd | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpblendvb | 0.33-0.50 ( -33.3[%]), 3.00-2.00 ( 50.0[%])
m256 vpdpwssd | N/A-0.20 ( N/A[%]), N/A-5.00 ( N/A[%])
m256 vpdpwssds | N/A-0.20 ( N/A[%]), N/A-5.00 ( N/A[%])
m256 vperm2f128 | 0.33-0.33 ( -0.0[%]), 3.00-3.00 ( 0.0[%])
m256 vpermpd | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m256 vpermps | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m256 vpgatherdd | 0.04-0.05 ( -8.3[%]), 24.00-22.00 ( 9.1[%])
m256 vpmovsxwd | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m256 vpshufb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vpxor | 5.84-4.50 ( 29.7[%]), 0.17-0.22 ( -22.9[%])
m256 vrcpps | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m256 vrsqrtps | 0.25-0.25 ( 0.2[%]), 4.00-4.01 ( -0.2[%])
m256 vsqrtps | 0.08-0.08 ( 0.0[%]), 12.00-12.00 ( -0.0[%])
m256 vxorps | 5.84-4.50 ( 29.7[%]), 0.17-0.22 ( -22.9[%])
m512 vaddpd | N/A-0.25 ( N/A[%]), N/A-4.00 ( N/A[%])
m512 vaddps | N/A-0.25 ( N/A[%]), N/A-4.00 ( N/A[%])
m512 vfmapd | N/A-0.25 ( N/A[%]), N/A-4.00 ( N/A[%])
m512 vfmaps | N/A-0.25 ( N/A[%]), N/A-4.01 ( N/A[%])
m512 vfmaps reg, reg, [mem] | N/A-0.25 ( N/A[%]), N/A-4.00 ( N/A[%])
m512 vorpd | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vorps | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vorps reg, reg, [mem] | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vpconflictd | N/A-0.04 ( N/A[%]), N/A-26.44 ( N/A[%])
m512 vpdpwssd | N/A-0.20 ( N/A[%]), N/A-5.00 ( N/A[%])
m512 vpdpwssds | N/A-0.20 ( N/A[%]), N/A-5.00 ( N/A[%])
m512 vpermt2d | N/A-0.33 ( N/A[%]), N/A-3.00 ( N/A[%])
m512 vpexpandd | N/A-0.33 ( N/A[%]), N/A-3.00 ( N/A[%])
m512 vplzcntq | N/A-0.33 ( N/A[%]), N/A-3.00 ( N/A[%])
m512 vpternlogd | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vrcp14pd | N/A-0.17 ( N/A[%]), N/A-6.00 ( N/A[%])
m512 vshufps | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
reg64 add | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
reg64 crc32 | 0.33-0.33 ( 0.1[%]), 3.00-3.00 ( -0.1[%])
reg64 lea | 5.90-1.00 ( 490.0[%]), 0.17-1.00 ( -83.1[%])
reg64 load | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
reg64 popcnt | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
reg64 store [mem+0]->load[mem+0] | 0.13-0.13 ( 1.0[%]), 7.43-7.50 ( -1.0[%])
reg64 store [mem+0]->load[mem+1] | 0.05-0.05 ( 9.8[%]), 20.04-22.00 ( -8.9[%])
reg64 xor | 5.90-4.92 ( 19.9[%]), 0.17-0.20 ( -16.6[%])
reg64 xor dst,dst | 5.84-4.92 ( 18.7[%]), 0.17-0.20 ( -15.8[%])
============= THROUGHPUT ===========================================================================
instruction | IPC ( rel[%]), CPI ( rel[%])
------------------------------------------+---------------------------------------------------------
m128 addps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 aesdec | 2.00-1.00 ( 100.1[%]), 0.50-1.00 ( -50.0[%])
m128 aesdeclast | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m128 aesenc | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m128 aesenclast | 2.00-1.00 ( 99.9[%]), 0.50-1.00 ( -50.0[%])
m128 blendps | 3.00-3.00 ( 0.0[%]), 0.33-0.33 ( -0.0[%])
m128 blendvps | 2.77-2.40 ( 15.4[%]), 0.36-0.42 ( -13.3[%])
m128 cvtps2dq | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 divpd | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 divps | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m128 dpps | 0.25-0.25 ( 0.0[%]), 4.03-4.03 ( -0.0[%])
m128 haddps | 0.95-0.93 ( 1.7[%]), 1.05-1.07 ( -1.6[%])
m128 loadps | 1.99-2.00 ( -0.6[%]), 0.50-0.50 ( 0.6[%])
m128 movaps [mem] | 1.99-1.99 ( -0.4[%]), 0.50-0.50 ( 0.4[%])
m128 movdqu [mem+1] | 1.99-2.00 ( -0.6[%]), 0.50-0.50 ( 0.6[%])
m128 movdqu [mem+2MB-1] (cross page) | 0.30-0.23 ( 33.7[%]), 3.31-4.42 ( -25.2[%])
m128 movdqu [mem+63] (cross cache) | 0.94-1.00 ( -6.0[%]), 1.06-1.00 ( 6.4[%])
m128 movq->movq | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 mulps | 2.00-2.00 ( -0.0[%]), 0.50-0.50 ( 0.0[%])
m128 padd | 3.00-3.00 ( 0.0[%]), 0.33-0.33 ( -0.0[%])
m128 pclmulqdq | 1.00-0.50 ( 100.0[%]), 1.00-2.00 ( -50.0[%])
m128 pcmpestri | 0.25-0.25 ( -0.0[%]), 4.03-4.03 ( 0.0[%])
m128 pcmpestrm | 0.20-0.20 ( 0.0[%]), 5.03-5.03 ( -0.0[%])
m128 pcmpistri | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m128 pcmpistrm | 0.33-0.33 ( -0.0[%]), 3.00-3.00 ( 0.0[%])
m128 phaddd | 0.95-0.93 ( 1.7[%]), 1.05-1.07 ( -1.6[%])
m128 pinsrd | 0.95-0.92 ( 2.9[%]), 1.05-1.08 ( -2.8[%])
m128 pmovmskb | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m128 pmuldq | 2.00-2.00 ( 0.1[%]), 0.50-0.50 ( -0.1[%])
m128 pmullw | 2.00-2.00 ( 0.1[%]), 0.50-0.50 ( -0.1[%])
m128 pshufb | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 pxor | 5.54-4.50 ( 23.1[%]), 0.18-0.22 ( -18.7[%])
m128 rcpps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 rsqrtps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 shufps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 vfmapd | 2.00-2.00 ( -0.0[%]), 0.50-0.50 ( 0.0[%])
m128 vfmaps | 2.00-1.99 ( 0.1[%]), 0.50-0.50 ( -0.1[%])
m128 xorps | 5.84-4.50 ( 29.7[%]), 0.17-0.22 ( -22.9[%])
m256 gather32(<ld+ins>x8 + perm) | 0.29-0.25 ( 14.3[%]), 3.50-4.00 ( -12.5[%])
m256 gather64(<ld+ins>x4 + perm) | 0.67-0.50 ( 33.3[%]), 1.50-2.00 ( -25.0[%])
m256 movaps [mem] | 1.99-2.00 ( -0.7[%]), 0.50-0.50 ( 0.7[%])
m256 vaddps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vdivpd | 0.12-0.12 ( 0.0[%]), 8.00-8.00 ( -0.0[%])
m256 vdivps | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m256 vfmapd | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( 0.0[%])
m256 vfmaps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vgatherdpd | 0.50-0.33 ( 49.9[%]), 2.00-3.00 ( -33.3[%])
m256 vmovdqu [mem+1] | 1.99-2.00 ( -0.6[%]), 0.50-0.50 ( 0.6[%])
m256 vmovdqu [mem+2MB-1] (cross page) | 0.30-0.23 ( 32.9[%]), 3.31-4.40 ( -24.7[%])
m256 vmovdqu [mem+63] (cross cache) | 0.94-1.00 ( -6.0[%]), 1.06-1.00 ( 6.4[%])
m256 vmulps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vpaddd | 3.00-3.00 ( -0.0[%]), 0.33-0.33 ( 0.0[%])
m256 vpblendvb | 0.97-1.00 ( -3.2[%]), 1.03-1.00 ( 3.3[%])
m256 vpdpwssd | N/A-2.00 ( N/A[%]), N/A-0.50 ( N/A[%])
m256 vpdpwssds | N/A-2.00 ( N/A[%]), N/A-0.50 ( N/A[%])
m256 vperm2f128 | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpermpd | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpermps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vpgatherdd | 0.33-0.20 ( 64.9[%]), 3.03-5.00 ( -39.4[%])
m256 vpmovmskb | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpmovsxwd | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpshufb | 2.00-2.00 ( -0.0[%]), 0.50-0.50 ( 0.0[%])
m256 vpxor | 5.53-4.48 ( 23.5[%]), 0.18-0.22 ( -19.0[%])
m256 vrcpps | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
m256 vrsqrtps | 1.00-1.00 ( 0.2[%]), 1.00-1.00 ( -0.2[%])
m256 vsqrtps | 0.17-0.17 ( 0.0[%]), 6.00-6.00 ( -0.0[%])
m256 vxorps | 5.54-4.50 ( 23.1[%]), 0.18-0.22 ( -18.7[%])
m512 vaddpd | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vaddps | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vfmapd | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vfmaps | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vfmaps reg, reg, [mem] | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vorpd | N/A-2.00 ( N/A[%]), N/A-0.50 ( N/A[%])
m512 vorps | N/A-2.00 ( N/A[%]), N/A-0.50 ( N/A[%])
m512 vorps reg, reg, [mem] | N/A-2.00 ( N/A[%]), N/A-0.50 ( N/A[%])
m512 vpconflictd | N/A-0.05 ( N/A[%]), N/A-19.40 ( N/A[%])
m512 vpdpwssd | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vpdpwssds | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vpermt2d | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
m512 vpexpandd | N/A-0.50 ( N/A[%]), N/A-2.00 ( N/A[%])
m512 vplzcntq | N/A-0.50 ( N/A[%]), N/A-2.00 ( N/A[%])
m512 vpternlogd | N/A-2.00 ( N/A[%]), N/A-0.50 ( N/A[%])
m512 vrcp14pd | N/A-0.50 ( N/A[%]), N/A-2.00 ( N/A[%])
m512 vshufps | N/A-1.00 ( N/A[%]), N/A-1.00 ( N/A[%])
reg64 add | 4.71-3.91 ( 20.3[%]), 0.21-0.26 ( -16.9[%])
reg64 crc32 | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
reg64 lea | 5.82-3.91 ( 48.8[%]), 0.17-0.26 ( -32.8[%])
reg64 load | 1.60-1.60 ( -0.0[%]), 0.63-0.63 ( 0.0[%])
reg64 popcnt | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
reg64 store [mem+0]->load[mem+0] | 0.81-0.80 ( 2.2[%]), 1.23-1.25 ( -2.2[%])
reg64 store [mem+0]->load[mem+1] | 0.07-0.06 ( 13.3[%]), 15.00-17.00 ( -11.8[%])
reg64 xor | 5.82-4.92 ( 18.2[%]), 0.17-0.20 ( -15.4[%])
reg64 xor dst,dst | 5.82-4.92 ( 18.2[%]), 0.17-0.20 ( -15.4[%])
Created
November 8, 2021 00:52
-
-
Save tanakamura/5af4a5446337d72cb12f5b04757e42ce to your computer and use it in GitHub Desktop.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment