Skip to content

Instantly share code, notes, and snippets.

@tanakamura
Last active July 13, 2019 10:45
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tanakamura/3b906a73f9c09cd33a5de5309a0a9a25 to your computer and use it in GitHub Desktop.
Save tanakamura/3b906a73f9c09cd33a5de5309a0a9a25 to your computer and use it in GitHub Desktop.
Ryzen 7 3700x vs i7-6700
============= LATENCY ==============================================================================
instruction | IPC ( rel[%]), CPI ( rel[%])
------------------------------------------+---------------------------------------------------------
m128 addps | 0.33-0.25 ( 33.3[%]), 3.00-4.00 ( -25.0[%])
m128 aesdec | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 aesdeclast | 0.25-0.25 ( -0.0[%]), 4.00-4.00 ( 0.0[%])
m128 aesenc | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 aesenclast | 0.25-0.25 ( -0.0[%]), 4.00-4.00 ( 0.0[%])
m128 blendps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 blendvps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 cvtps2dq | 0.33-0.25 ( 33.4[%]), 3.00-4.00 ( -25.0[%])
m128 divpd | 0.08-0.08 ( 0.0[%]), 13.00-13.00 ( -0.0[%])
m128 divps | 0.10-0.09 ( 10.0[%]), 10.00-11.00 ( -9.1[%])
m128 dpps | 0.07-0.08 ( -13.3[%]), 15.00-13.00 ( 15.4[%])
m128 haddps | 0.50-0.33 ( 50.0[%]), 2.00-3.00 ( -33.3[%])
m128 loadps->movq | 0.11-0.12 ( -11.1[%]), 9.00-8.00 ( 12.5[%])
m128 movaps [mem] | 0.11-0.12 ( -11.1[%]), 9.00-8.00 ( 12.5[%])
m128 movdqu [mem+1] | 0.10-0.12 ( -20.0[%]), 10.00-8.00 ( 25.0[%])
m128 movdqu [mem+2MB-1] (cross page) | 0.09-0.07 ( 36.4[%]), 11.00-15.00 ( -26.7[%])
m128 movdqu [mem+63] (cross cache) | 0.09-0.07 ( 27.3[%]), 11.00-14.00 ( -21.4[%])
m128 movq->movq | 0.17-0.25 ( -33.3[%]), 6.00-4.00 ( 50.0[%])
m128 mulps | 0.33-0.25 ( 33.4[%]), 3.00-4.00 ( -25.0[%])
m128 padd | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 pclmulqdq | 0.22-0.14 ( 53.0[%]), 4.58-7.00 ( -34.6[%])
m128 pcmpestri->movq | 0.09-0.08 ( 9.1[%]), 11.00-12.00 ( -8.3[%])
m128 pcmpestrm | 0.14-0.11 ( 22.8[%]), 7.33-9.00 ( -18.5[%])
m128 pcmpistri->movq | 0.09-0.08 ( 9.1[%]), 11.00-12.00 ( -8.3[%])
m128 pcmpistrm | 0.14-0.11 ( 26.5[%]), 7.00-8.85 ( -20.9[%])
m128 phaddd | 0.50-0.33 ( 50.1[%]), 2.00-3.00 ( -33.4[%])
m128 pinsrd | 0.56-0.50 ( 11.9[%]), 1.79-2.00 ( -10.6[%])
m128 pinsrd->pextr | 0.12-0.17 ( -25.0[%]), 8.00-6.00 ( 33.3[%])
m128 pmovmskb->movq | 0.17-0.25 ( -33.3[%]), 6.00-4.00 ( 49.9[%])
m128 pmuldq | 0.33-0.20 ( 66.7[%]), 3.00-5.00 ( -40.0[%])
m128 pmullw | 0.33-0.20 ( 66.7[%]), 3.00-5.00 ( -40.0[%])
m128 pshufb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 pxor | 4.00-3.89 ( 2.8[%]), 0.25-0.26 ( -2.7[%])
m128 rcpps | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m128 rsqrtps | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m128 shufps | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
m128 vfmapd | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m128 vfmaps | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m128 xorps | 4.00-3.89 ( 2.8[%]), 0.25-0.26 ( -2.7[%])
m256 gather32(<ld+ins>x8 + perm) | 0.05-0.06 ( -4.0[%]), 18.75-18.00 ( 4.2[%])
m256 gather64(<ld+ins>x4 + perm) | 0.07-0.08 ( -12.5[%]), 14.86-13.00 ( 14.3[%])
m256 movaps [mem] | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vaddps | 0.33-0.25 ( 33.4[%]), 3.00-4.00 ( -25.0[%])
m256 vdivpd | 0.08-0.08 ( 0.0[%]), 13.00-13.00 ( -0.0[%])
m256 vdivps | 0.10-0.09 ( 10.0[%]), 10.00-11.00 ( -9.1[%])
m256 vfmapd | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m256 vfmaps | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m256 vgatherdpd | 0.05-0.05 ( 5.3[%]), 19.00-20.00 ( -5.0[%])
m256 vmovdqu [mem+1] | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vmovdqu [mem+2MB-1] (cross page) | 1.00-0.26 ( 290.3[%]), 1.00-3.90 ( -74.4[%])
m256 vmovdqu [mem+63] (cross cache) | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vmulps | 0.33-0.25 ( 33.4[%]), 3.00-4.00 ( -25.0[%])
m256 vpaddd | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpblendvb | 1.00-0.50 ( 100.0[%]), 1.00-2.00 ( -50.0[%])
m256 vperm2f128 | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m256 vpermpd | 0.17-0.33 ( -50.0[%]), 6.00-3.00 ( 100.0[%])
m256 vpermps | 0.12-0.33 ( -62.5[%]), 8.00-3.00 ( 166.5[%])
m256 vpgatherdd | 0.04-0.05 ( -8.3[%]), 24.00-22.00 ( 9.1[%])
m256 vpmovsxwd | 0.25-0.33 ( -25.0[%]), 4.00-3.00 ( 33.3[%])
m256 vpshufb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vpxor | 4.00-3.89 ( 2.8[%]), 0.25-0.26 ( -2.7[%])
m256 vrcpps | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m256 vrsqrtps | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m256 vsqrtps | 0.07-0.08 ( -14.3[%]), 14.00-12.00 ( 16.6[%])
m256 vxorps | 4.00-3.89 ( 2.8[%]), 0.25-0.26 ( -2.7[%])
reg64 add | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
reg64 crc32 | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
reg64 lea | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
reg64 load | 0.25-0.20 ( 25.0[%]), 4.00-5.00 ( -20.0[%])
reg64 popcnt | 1.00-0.33 ( 200.0[%]), 1.00-3.00 ( -66.7[%])
reg64 store [mem+0]->load[mem+0] | 0.03-0.13 ( -79.4[%]), 37.56-7.75 ( 384.8[%])
reg64 store [mem+0]->load[mem+1] | 0.03-0.05 ( -49.2[%]), 37.38-19.00 ( 96.7[%])
reg64 xor | 3.88-3.94 ( -1.5[%]), 0.26-0.25 ( 1.6[%])
reg64 xor dst,dst | 3.88-3.93 ( -1.3[%]), 0.26-0.25 ( 1.3[%])
============= THROUGHPUT ===========================================================================
instruction | IPC ( rel[%]), CPI ( rel[%])
------------------------------------------+---------------------------------------------------------
m128 addps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 aesdec | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m128 aesdeclast | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m128 aesenc | 2.00-1.00 ( 100.4[%]), 0.50-1.00 ( -50.1[%])
m128 aesenclast | 2.00-1.00 ( 100.4[%]), 0.50-1.00 ( -50.1[%])
m128 blendps | 3.00-3.00 ( 0.0[%]), 0.33-0.33 ( -0.0[%])
m128 blendvps | 2.00-1.97 ( 1.4[%]), 0.50-0.51 ( -1.4[%])
m128 cvtps2dq | 1.00-2.00 ( -50.0[%]), 1.00-0.50 ( 100.0[%])
m128 divpd | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m128 divps | 0.29-0.33 ( -14.3[%]), 3.50-3.00 ( 16.6[%])
m128 dpps | 0.25-0.63 ( -60.0[%]), 4.00-1.60 ( 150.3[%])
m128 haddps | 0.50-0.50 ( -0.0[%]), 2.00-2.00 ( 0.0[%])
m128 loadps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 movaps [mem] | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 movdqu [mem+1] | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 movdqu [mem+2MB-1] (cross page) | 1.00-0.26 ( 289.8[%]), 1.00-3.90 ( -74.3[%])
m128 movdqu [mem+63] (cross cache) | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m128 movq->movq | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 mulps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 padd | 3.00-3.00 ( 0.0[%]), 0.33-0.33 ( -0.0[%])
m128 pclmulqdq | 0.50-1.00 ( -50.0[%]), 2.00-1.00 ( 100.0[%])
m128 pcmpestri | 0.33-0.25 ( 34.3[%]), 3.00-4.03 ( -25.5[%])
m128 pcmpestrm | 0.33-0.20 ( 67.6[%]), 3.00-5.03 ( -40.3[%])
m128 pcmpistri | 0.50-0.33 ( 50.1[%]), 2.00-3.00 ( -33.4[%])
m128 pcmpistrm | 0.50-0.33 ( 50.0[%]), 2.00-3.00 ( -33.3[%])
m128 phaddd | 0.50-0.50 ( 0.0[%]), 2.00-2.00 ( -0.0[%])
m128 pinsrd | 0.78-0.50 ( 55.8[%]), 1.28-2.00 ( -35.8[%])
m128 pmovmskb | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
m128 pmuldq | 1.00-2.00 ( -50.0[%]), 1.00-0.50 ( 100.0[%])
m128 pmullw | 1.00-2.00 ( -50.0[%]), 1.00-0.50 ( 100.0[%])
m128 pshufb | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m128 pxor | 4.00-3.89 ( 2.8[%]), 0.25-0.26 ( -2.7[%])
m128 rcpps | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m128 rsqrtps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 shufps | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m128 vfmapd | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 vfmaps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 xorps | 4.00-3.89 ( 2.7[%]), 0.25-0.26 ( -2.6[%])
m256 gather32(<ld+ins>x8 + perm) | 0.25-0.14 ( 75.0[%]), 4.00-7.00 ( -42.9[%])
m256 gather64(<ld+ins>x4 + perm) | 0.50-0.33 ( 50.0[%]), 2.00-3.00 ( -33.3[%])
m256 movaps [mem] | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vaddps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vdivpd | 0.20-0.12 ( 60.0[%]), 5.00-8.00 ( -37.5[%])
m256 vdivps | 0.29-0.20 ( 42.9[%]), 3.50-5.00 ( -30.0[%])
m256 vfmapd | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vfmaps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vgatherdpd | 0.11-0.25 ( -55.5[%]), 9.00-4.00 ( 124.9[%])
m256 vmovdqu [mem+1] | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vmovdqu [mem+2MB-1] (cross page) | 1.00-0.26 ( 284.5[%]), 1.00-3.85 ( -74.0[%])
m256 vmovdqu [mem+63] (cross cache) | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vmulps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m256 vpaddd | 3.00-3.00 ( 0.0[%]), 0.33-0.33 ( -0.0[%])
m256 vpblendvb | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vperm2f128 | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpermpd | 0.78-1.00 ( -21.6[%]), 1.28-1.00 ( 27.6[%])
m256 vpermps | 0.50-1.00 ( -50.0[%]), 2.00-1.00 ( 99.9[%])
m256 vpgatherdd | 0.06-0.20 ( -68.7[%]), 16.00-5.00 ( 219.9[%])
m256 vpmovmskb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vpmovsxwd | 0.88-1.00 ( -12.3[%]), 1.14-1.00 ( 14.0[%])
m256 vpshufb | 2.00-1.00 ( 100.1[%]), 0.50-1.00 ( -50.0[%])
m256 vpxor | 4.00-3.89 ( 2.8[%]), 0.25-0.26 ( -2.7[%])
m256 vrcpps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vrsqrtps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vsqrtps | 0.18-0.17 ( 9.1[%]), 5.50-6.00 ( -8.4[%])
m256 vxorps | 4.00-3.89 ( 2.8[%]), 0.25-0.26 ( -2.7[%])
reg64 add | 3.86-3.94 ( -1.8[%]), 0.26-0.25 ( 1.9[%])
reg64 crc32 | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
reg64 lea | 3.86-2.00 ( 93.2[%]), 0.26-0.50 ( -48.2[%])
reg64 load | 1.60-1.60 ( 0.1[%]), 0.63-0.63 ( -0.1[%])
reg64 popcnt | 3.86-1.00 ( 286.4[%]), 0.26-1.00 ( -74.1[%])
reg64 store [mem+0]->load[mem+0] | 0.26-0.74 ( -65.5[%]), 3.91-1.35 ( 190.0[%])
reg64 store [mem+0]->load[mem+1] | 0.07-0.08 ( -7.1[%]), 14.00-13.00 ( 7.7[%])
reg64 xor | 3.88-3.93 ( -1.4[%]), 0.26-0.25 ( 1.4[%])
reg64 xor dst,dst | 3.88-3.94 ( -1.5[%]), 0.26-0.25 ( 1.5[%])
Ryzen 7 3700x vs Ryzen 7 1700
============= LATENCY ==============================================================================
instruction | IPC ( rel[%]), CPI ( rel[%])
------------------------------------------+---------------------------------------------------------
m128 addps | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m128 aesdec | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 aesdeclast | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 aesenc | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 aesenclast | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 blendps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 blendvps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 cvtps2dq | 0.33-0.25 ( 33.3[%]), 3.00-4.00 ( -25.0[%])
m128 divpd | 0.08-0.12 ( -38.5[%]), 13.00-8.00 ( 62.5[%])
m128 divps | 0.10-0.10 ( 0.0[%]), 10.00-10.00 ( -0.0[%])
m128 dpps | 0.07-0.07 ( 0.0[%]), 15.00-15.00 ( -0.0[%])
m128 haddps | 0.50-0.50 ( -0.0[%]), 2.00-2.00 ( 0.0[%])
m128 loadps->movq | 0.11-0.11 ( 0.0[%]), 9.00-9.00 ( -0.0[%])
m128 movaps [mem] | 0.11-0.11 ( 0.0[%]), 9.00-9.00 ( -0.0[%])
m128 movdqu [mem+1] | 0.10-0.10 ( 0.0[%]), 10.00-10.00 ( -0.0[%])
m128 movdqu [mem+2MB-1] (cross page) | 0.09-0.09 ( 0.3[%]), 11.00-11.03 ( -0.3[%])
m128 movdqu [mem+63] (cross cache) | 0.09-0.09 ( 0.1[%]), 11.00-11.01 ( -0.1[%])
m128 movq->movq | 0.17-0.17 ( 0.2[%]), 6.00-6.01 ( -0.2[%])
m128 mulps | 0.33-0.33 ( 0.3[%]), 3.00-3.01 ( -0.3[%])
m128 padd | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 pclmulqdq | 0.22-0.22 ( -1.7[%]), 4.58-4.50 ( 1.7[%])
m128 pcmpestri->movq | 0.09-0.09 ( 0.0[%]), 11.00-11.00 ( -0.0[%])
m128 pcmpestrm | 0.14-0.12 ( 15.9[%]), 7.33-8.50 ( -13.7[%])
m128 pcmpistri->movq | 0.09-0.09 ( 0.0[%]), 11.00-11.00 ( -0.0[%])
m128 pcmpistrm | 0.14-0.14 ( 0.0[%]), 7.00-7.00 ( -0.0[%])
m128 phaddd | 0.50-0.50 ( -0.0[%]), 2.00-2.00 ( 0.0[%])
m128 pinsrd | 0.56-0.59 ( -4.7[%]), 1.79-1.70 ( 5.0[%])
m128 pinsrd->pextr | 0.12-0.12 ( 0.0[%]), 8.00-8.00 ( -0.0[%])
m128 pmovmskb->movq | 0.17-0.17 ( 0.0[%]), 6.00-6.00 ( -0.0[%])
m128 pmuldq | 0.33-0.33 ( 0.1[%]), 3.00-3.00 ( -0.1[%])
m128 pmullw | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m128 pshufb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 pxor | 4.00-4.00 ( 0.0[%]), 0.25-0.25 ( -0.0[%])
m128 rcpps | 0.20-0.20 ( 0.1[%]), 5.00-5.01 ( -0.1[%])
m128 rsqrtps | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m128 shufps | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 vfmapd | 0.20-0.20 ( -0.0[%]), 5.00-5.00 ( 0.0[%])
m128 vfmaps | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m128 xorps | 4.00-4.00 ( 0.0[%]), 0.25-0.25 ( -0.0[%])
m256 gather32(<ld+ins>x8 + perm) | 0.05-0.06 ( -7.3[%]), 18.75-17.39 ( 7.8[%])
m256 gather64(<ld+ins>x4 + perm) | 0.07-0.08 ( -12.4[%]), 14.86-13.02 ( 14.2[%])
m256 movaps [mem] | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vaddps | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m256 vdivpd | 0.08-0.12 ( -38.5[%]), 13.00-8.00 ( 62.5[%])
m256 vdivps | 0.10-0.10 ( 0.0[%]), 10.00-10.00 ( -0.0[%])
m256 vfmapd | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m256 vfmaps | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m256 vgatherdpd | 0.05-0.06 ( -4.5[%]), 19.00-18.14 ( 4.7[%])
m256 vmovdqu [mem+1] | 1.00-0.67 ( 50.0[%]), 1.00-1.50 ( -33.3[%])
m256 vmovdqu [mem+2MB-1] (cross page) | 1.00-0.67 ( 50.0[%]), 1.00-1.50 ( -33.3[%])
m256 vmovdqu [mem+63] (cross cache) | 1.00-0.67 ( 50.0[%]), 1.00-1.50 ( -33.3[%])
m256 vmulps | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m256 vpaddd | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m256 vpblendvb | 1.00-0.50 ( 100.1[%]), 1.00-2.00 ( -50.0[%])
m256 vperm2f128 | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m256 vpermpd | 0.17-0.50 ( -66.6[%]), 6.00-2.01 ( 199.1[%])
m256 vpermps | 0.12-0.20 ( -37.5[%]), 8.00-5.00 ( 60.0[%])
m256 vpgatherdd | 0.04-0.05 ( -11.2[%]), 24.00-21.31 ( 12.6[%])
m256 vpmovsxwd | 0.25-0.50 ( -50.0[%]), 4.00-2.00 ( 100.0[%])
m256 vpshufb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vpxor | 4.00-2.00 ( 100.0[%]), 0.25-0.50 ( -50.0[%])
m256 vrcpps | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m256 vrsqrtps | 0.20-0.20 ( 0.0[%]), 5.00-5.00 ( -0.0[%])
m256 vsqrtps | 0.07-0.12 ( -42.8[%]), 14.00-8.00 ( 75.0[%])
m256 vxorps | 4.00-2.00 ( 100.0[%]), 0.25-0.50 ( -50.0[%])
reg64 add | 1.00-0.95 ( 4.9[%]), 1.00-1.05 ( -4.7[%])
reg64 crc32 | 0.33-0.33 ( 0.5[%]), 3.00-3.02 ( -0.5[%])
reg64 lea | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
reg64 load | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
reg64 popcnt | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
reg64 store [mem+0]->load[mem+0] | 0.03-0.04 ( -30.3[%]), 37.56-26.18 ( 43.5[%])
reg64 store [mem+0]->load[mem+1] | 0.03-0.03 ( -16.2[%]), 37.38-31.32 ( 19.3[%])
reg64 xor | 3.88-3.88 ( -0.0[%]), 0.26-0.26 ( 0.0[%])
reg64 xor dst,dst | 3.88-3.88 ( 0.0[%]), 0.26-0.26 ( -0.0[%])
============= THROUGHPUT ===========================================================================
instruction | IPC ( rel[%]), CPI ( rel[%])
------------------------------------------+---------------------------------------------------------
m128 addps | 2.00-2.00 ( 0.1[%]), 0.50-0.50 ( -0.1[%])
m128 aesdec | 2.00-1.95 ( 2.3[%]), 0.50-0.51 ( -2.3[%])
m128 aesdeclast | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 aesenc | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 aesenclast | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( 0.0[%])
m128 blendps | 3.00-2.00 ( 50.0[%]), 0.33-0.50 ( -33.3[%])
m128 blendvps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 cvtps2dq | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 divpd | 0.20-0.25 ( -20.0[%]), 5.00-4.00 ( 25.0[%])
m128 divps | 0.29-0.33 ( -14.3[%]), 3.50-3.00 ( 16.7[%])
m128 dpps | 0.25-0.25 ( 0.0[%]), 4.00-4.00 ( -0.0[%])
m128 haddps | 0.50-0.50 ( 0.1[%]), 2.00-2.00 ( -0.1[%])
m128 loadps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 movaps [mem] | 2.00-2.00 ( 0.1[%]), 0.50-0.50 ( -0.1[%])
m128 movdqu [mem+1] | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 movdqu [mem+2MB-1] (cross page) | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 movdqu [mem+63] (cross cache) | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m128 movq->movq | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 mulps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 padd | 3.00-3.00 ( 0.0[%]), 0.33-0.33 ( -0.0[%])
m128 pclmulqdq | 0.50-0.50 ( 0.0[%]), 2.00-2.00 ( -0.0[%])
m128 pcmpestri | 0.33-0.33 ( 0.0[%]), 3.00-3.00 ( -0.0[%])
m128 pcmpestrm | 0.33-0.33 ( 0.4[%]), 3.00-3.01 ( -0.4[%])
m128 pcmpistri | 0.50-0.50 ( -0.0[%]), 2.00-2.00 ( 0.0[%])
m128 pcmpistrm | 0.50-0.50 ( 0.0[%]), 2.00-2.00 ( -0.0[%])
m128 phaddd | 0.50-0.50 ( 0.0[%]), 2.00-2.00 ( -0.0[%])
m128 pinsrd | 0.78-0.76 ( 2.1[%]), 1.28-1.31 ( -2.0[%])
m128 pmovmskb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m128 pmuldq | 1.00-1.00 ( 0.1[%]), 1.00-1.00 ( -0.1[%])
m128 pmullw | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m128 pshufb | 2.00-2.00 ( 0.1[%]), 0.50-0.50 ( -0.1[%])
m128 pxor | 4.00-4.00 ( 0.0[%]), 0.25-0.25 ( -0.0[%])
m128 rcpps | 1.00-0.99 ( 0.9[%]), 1.00-1.01 ( -0.9[%])
m128 rsqrtps | 1.00-1.00 ( -0.0[%]), 1.00-1.00 ( 0.0[%])
m128 shufps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 vfmapd | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 vfmaps | 2.00-2.00 ( 0.0[%]), 0.50-0.50 ( -0.0[%])
m128 xorps | 4.00-4.00 ( -0.1[%]), 0.25-0.25 ( 0.1[%])
m256 gather32(<ld+ins>x8 + perm) | 0.25-0.20 ( 25.7[%]), 4.00-5.03 ( -20.4[%])
m256 gather64(<ld+ins>x4 + perm) | 0.50-0.33 ( 51.4[%]), 2.00-3.03 ( -33.9[%])
m256 movaps [mem] | 2.00-0.99 ( 101.4[%]), 0.50-1.01 ( -50.4[%])
m256 vaddps | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m256 vdivpd | 0.20-0.12 ( 60.1[%]), 5.00-8.01 ( -37.5[%])
m256 vdivps | 0.29-0.17 ( 71.4[%]), 3.50-6.00 ( -41.7[%])
m256 vfmapd | 2.00-1.00 ( 100.0[%]), 0.50-1.00 ( -50.0[%])
m256 vfmaps | 2.00-0.97 ( 106.6[%]), 0.50-1.03 ( -51.6[%])
m256 vgatherdpd | 0.11-0.08 ( 33.3[%]), 9.00-12.00 ( -25.0[%])
m256 vmovdqu [mem+1] | 2.00-0.67 ( 200.0[%]), 0.50-1.50 ( -66.7[%])
m256 vmovdqu [mem+2MB-1] (cross page) | 1.00-0.67 ( 50.0[%]), 1.00-1.50 ( -33.3[%])
m256 vmovdqu [mem+63] (cross cache) | 1.00-0.67 ( 50.0[%]), 1.00-1.50 ( -33.3[%])
m256 vmulps | 2.00-0.99 ( 102.6[%]), 0.50-1.01 ( -50.6[%])
m256 vpaddd | 3.00-1.50 ( 100.0[%]), 0.33-0.67 ( -50.0[%])
m256 vpblendvb | 1.00-0.50 ( 100.0[%]), 1.00-2.00 ( -50.0[%])
m256 vperm2f128 | 1.00-0.31 ( 220.9[%]), 1.00-3.21 ( -68.8[%])
m256 vpermpd | 0.78-0.50 ( 56.8[%]), 1.28-2.00 ( -36.2[%])
m256 vpermps | 0.50-0.25 ( 100.0[%]), 2.00-4.00 ( -50.0[%])
m256 vpgatherdd | 0.06-0.05 ( 25.0[%]), 16.00-20.00 ( -20.0[%])
m256 vpmovmskb | 1.00-1.00 ( 0.0[%]), 1.00-1.00 ( -0.0[%])
m256 vpmovsxwd | 0.88-0.50 ( 75.4[%]), 1.14-2.00 ( -43.0[%])
m256 vpshufb | 2.00-1.00 ( 100.1[%]), 0.50-1.00 ( -50.0[%])
m256 vpxor | 4.00-2.00 ( 100.0[%]), 0.25-0.50 ( -50.0[%])
m256 vrcpps | 1.00-0.50 ( 100.0[%]), 1.00-2.00 ( -50.0[%])
m256 vrsqrtps | 1.00-0.50 ( 100.0[%]), 1.00-2.00 ( -50.0[%])
m256 vsqrtps | 0.18-0.12 ( 45.5[%]), 5.50-8.00 ( -31.2[%])
m256 vxorps | 4.00-2.00 ( 100.1[%]), 0.25-0.50 ( -50.0[%])
reg64 add | 3.86-3.88 ( -0.4[%]), 0.26-0.26 ( 0.4[%])
reg64 crc32 | 1.00-0.33 ( 201.6[%]), 1.00-3.02 ( -66.8[%])
reg64 lea | 3.86-3.88 ( -0.4[%]), 0.26-0.26 ( 0.4[%])
reg64 load | 1.60-1.60 ( 0.0[%]), 0.63-0.63 ( -0.0[%])
reg64 popcnt | 3.86-3.88 ( -0.4[%]), 0.26-0.26 ( 0.4[%])
reg64 store [mem+0]->load[mem+0] | 0.26-0.36 ( -29.9[%]), 3.91-2.75 ( 42.6[%])
reg64 store [mem+0]->load[mem+1] | 0.07-0.06 ( 21.4[%]), 14.00-17.00 ( -17.7[%])
reg64 xor | 3.88-3.69 ( 5.1[%]), 0.26-0.27 ( -4.8[%])
reg64 xor dst,dst | 3.88-3.87 ( 0.1[%]), 0.26-0.26 ( -0.1[%])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment