Skip to content

Instantly share code, notes, and snippets.

@tanakamura
Created November 8, 2021 00:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tanakamura/5af4a5446337d72cb12f5b04757e42ce to your computer and use it in GitHub Desktop.
Save tanakamura/5af4a5446337d72cb12f5b04757e42ce to your computer and use it in GitHub Desktop.
============= LATENCY ==============================================================================
                              instruction |     IPC         (   rel[%]),     CPI         (   rel[%])
------------------------------------------+---------------------------------------------------------
    m128                            addps |    0.50-0.25    ( 100.0[%]),    2.00-4.00    ( -50.0[%])
    m128                           aesdec |    0.33-0.14    ( 133.4[%]),    3.00-7.00    ( -57.1[%])
    m128                       aesdeclast |    0.33-0.14    ( 133.4[%]),    3.00-7.00    ( -57.1[%])
    m128                           aesenc |    0.33-0.14    ( 133.3[%]),    3.00-7.00    ( -57.1[%])
    m128                       aesenclast |    0.33-0.14    ( 133.4[%]),    3.00-7.00    ( -57.1[%])
    m128                          blendps |    1.00-1.00    (   0.1[%]),    1.00-1.00    (  -0.1[%])
    m128                         blendvps |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
    m128                         cvtps2dq |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                            divpd |    0.08-0.08    (   0.0[%]),   13.00-13.00   (  -0.0[%])
    m128                            divps |    0.09-0.09    (   0.0[%]),   11.00-11.00   (  -0.0[%])
    m128                             dpps |    0.07-0.07    (  -0.0[%]),   14.00-14.00   (   0.0[%])
    m128                           haddps |    0.48-0.48    (   0.5[%]),    2.07-2.08    (  -0.5[%])
    m128                     loadps->movq |    0.12-0.12    (   0.0[%]),    8.00-8.00    (  -0.0[%])
    m128                     movaps [mem] |    0.12-0.12    (   0.0[%]),    8.00-8.00    (  -0.0[%])
    m128                   movdqu [mem+1] |    0.12-0.12    (   0.0[%]),    8.00-8.00    (  -0.0[%])
    m128  movdqu [mem+2MB-1] (cross page) |    0.07-0.07    (   0.0[%]),   15.00-15.00   (  -0.0[%])
    m128    movdqu [mem+63] (cross cache) |    0.07-0.07    (  -6.7[%]),   15.00-14.00   (   7.1[%])
    m128                       movq->movq |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                            mulps |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                             padd |    1.00-1.00    (   0.1[%]),    1.00-1.00    (  -0.1[%])
    m128                        pclmulqdq |    0.33-0.17    ( 100.0[%]),    3.00-6.00    ( -50.0[%])
    m128                  pcmpestri->movq |    0.08-0.08    (   0.0[%]),   13.00-13.00   (  -0.0[%])
    m128                        pcmpestrm |    0.09-0.09    (   0.8[%]),   10.71-10.79   (  -0.8[%])
    m128                  pcmpistri->movq |    0.08-0.08    (   0.0[%]),   12.00-12.00   (  -0.0[%])
    m128                        pcmpistrm |    0.11-0.11    (   1.3[%]),    8.79-8.91    (  -1.3[%])
    m128                           phaddd |    0.48-0.48    (   0.5[%]),    2.07-2.08    (  -0.5[%])
    m128                           pinsrd |    0.95-0.92    (   3.2[%]),    1.05-1.09    (  -3.1[%])
    m128                    pinsrd->pextr |    0.17-0.17    (  -0.0[%]),    6.00-6.00    (   0.0[%])
    m128                   pmovmskb->movq |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                           pmuldq |    0.20-0.20    (   0.0[%]),    5.00-5.00    (  -0.0[%])
    m128                           pmullw |    0.20-0.20    (   0.0[%]),    5.00-5.00    (  -0.0[%])
    m128                           pshufb |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
    m128                             pxor |    5.54-4.50    (  23.1[%]),    0.18-0.22    ( -18.7[%])
    m128                            rcpps |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                          rsqrtps |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                           shufps |    1.00-1.00    (   0.1[%]),    1.00-1.00    (  -0.1[%])
    m128                           vfmapd |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                           vfmaps |    0.25-0.25    (   0.1[%]),    4.00-4.00    (  -0.1[%])
    m128                            xorps |    5.73-4.50    (  27.4[%]),    0.17-0.22    ( -21.5[%])
    m256      gather32(<ld+ins>x8 + perm) |    0.06-0.06    (   0.6[%]),   16.27-16.36   (  -0.6[%])
    m256      gather64(<ld+ins>x4 + perm) |    0.08-0.08    (   0.2[%]),   12.43-12.45   (  -0.2[%])
    m256             movaps [mem] -> movq |    0.11-0.11    (   0.1[%]),    9.00-9.01    (  -0.1[%])
    m256                           vaddps |    0.50-0.25    ( 100.0[%]),    2.00-4.00    ( -50.0[%])
    m256                           vdivpd |    0.08-0.08    (   0.0[%]),   13.00-13.00   (  -0.0[%])
    m256                           vdivps |    0.09-0.09    (   0.1[%]),   11.00-11.01   (  -0.1[%])
    m256                           vfmapd |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m256                           vfmaps |    0.25-0.25    (  -0.0[%]),    4.00-4.00    (   0.0[%])
    m256                       vgatherdpd |    0.04-0.05    ( -13.0[%]),   23.00-20.01   (  14.9[%])
    m256          vmovdqu [mem+1] -> movq |    0.11-0.11    (   0.0[%]),    9.00-9.00    (  -0.0[%])
    m256 vmovdqu [mem+2MB-1] (cross page) -> movq |    0.06-0.06    (   0.0[%]),   16.00-16.01   (  -0.0[%])
    m256 vmovdqu [mem+63] (cross cache) -> movq |    0.06-0.07    ( -12.5[%]),   16.00-14.00   (  14.3[%])
    m256                           vmulps |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m256                           vpaddd |    1.00-1.00    (  -0.0[%]),    1.00-1.00    (   0.0[%])
    m256                        vpblendvb |    0.33-0.50    ( -33.3[%]),    3.00-2.00    (  50.0[%])
    m256                         vpdpwssd |     N/A-0.20    (   N/A[%]),     N/A-5.00    (   N/A[%])
    m256                        vpdpwssds |     N/A-0.20    (   N/A[%]),     N/A-5.00    (   N/A[%])
    m256                       vperm2f128 |    0.33-0.33    (  -0.0[%]),    3.00-3.00    (   0.0[%])
    m256                          vpermpd |    0.33-0.33    (   0.0[%]),    3.00-3.00    (  -0.0[%])
    m256                          vpermps |    0.33-0.33    (   0.0[%]),    3.00-3.00    (  -0.0[%])
    m256                       vpgatherdd |    0.04-0.05    (  -8.3[%]),   24.00-22.00   (   9.1[%])
    m256                        vpmovsxwd |    0.33-0.33    (   0.0[%]),    3.00-3.00    (  -0.0[%])
    m256                          vpshufb |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
    m256                            vpxor |    5.84-4.50    (  29.7[%]),    0.17-0.22    ( -22.9[%])
    m256                           vrcpps |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m256                         vrsqrtps |    0.25-0.25    (   0.2[%]),    4.00-4.01    (  -0.2[%])
    m256                          vsqrtps |    0.08-0.08    (   0.0[%]),   12.00-12.00   (  -0.0[%])
    m256                           vxorps |    5.84-4.50    (  29.7[%]),    0.17-0.22    ( -22.9[%])
    m512                           vaddpd |     N/A-0.25    (   N/A[%]),     N/A-4.00    (   N/A[%])
    m512                           vaddps |     N/A-0.25    (   N/A[%]),     N/A-4.00    (   N/A[%])
    m512                           vfmapd |     N/A-0.25    (   N/A[%]),     N/A-4.00    (   N/A[%])
    m512                           vfmaps |     N/A-0.25    (   N/A[%]),     N/A-4.01    (   N/A[%])
    m512           vfmaps reg, reg, [mem] |     N/A-0.25    (   N/A[%]),     N/A-4.00    (   N/A[%])
    m512                            vorpd |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                            vorps |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512            vorps reg, reg, [mem] |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                      vpconflictd |     N/A-0.04    (   N/A[%]),     N/A-26.44   (   N/A[%])
    m512                         vpdpwssd |     N/A-0.20    (   N/A[%]),     N/A-5.00    (   N/A[%])
    m512                        vpdpwssds |     N/A-0.20    (   N/A[%]),     N/A-5.00    (   N/A[%])
    m512                         vpermt2d |     N/A-0.33    (   N/A[%]),     N/A-3.00    (   N/A[%])
    m512                        vpexpandd |     N/A-0.33    (   N/A[%]),     N/A-3.00    (   N/A[%])
    m512                         vplzcntq |     N/A-0.33    (   N/A[%]),     N/A-3.00    (   N/A[%])
    m512                       vpternlogd |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                         vrcp14pd |     N/A-0.17    (   N/A[%]),     N/A-6.00    (   N/A[%])
    m512                          vshufps |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
   reg64                              add |    1.00-1.00    (  -0.0[%]),    1.00-1.00    (   0.0[%])
   reg64                            crc32 |    0.33-0.33    (   0.1[%]),    3.00-3.00    (  -0.1[%])
   reg64                              lea |    5.90-1.00    ( 490.0[%]),    0.17-1.00    ( -83.1[%])
   reg64                             load |    0.20-0.20    (   0.0[%]),    5.00-5.00    (  -0.0[%])
   reg64                           popcnt |    0.33-0.33    (   0.0[%]),    3.00-3.00    (  -0.0[%])
   reg64       store [mem+0]->load[mem+0] |    0.13-0.13    (   1.0[%]),    7.43-7.50    (  -1.0[%])
   reg64       store [mem+0]->load[mem+1] |    0.05-0.05    (   9.8[%]),   20.04-22.00   (  -8.9[%])
   reg64                              xor |    5.90-4.92    (  19.9[%]),    0.17-0.20    ( -16.6[%])
   reg64                      xor dst,dst |    5.84-4.92    (  18.7[%]),    0.17-0.20    ( -15.8[%])


============= THROUGHPUT ===========================================================================
                              instruction |     IPC         (   rel[%]),     CPI         (   rel[%])
------------------------------------------+---------------------------------------------------------
    m128                            addps |    2.00-2.00    (   0.0[%]),    0.50-0.50    (  -0.0[%])
    m128                           aesdec |    2.00-1.00    ( 100.1[%]),    0.50-1.00    ( -50.0[%])
    m128                       aesdeclast |    2.00-1.00    ( 100.0[%]),    0.50-1.00    ( -50.0[%])
    m128                           aesenc |    2.00-1.00    ( 100.0[%]),    0.50-1.00    ( -50.0[%])
    m128                       aesenclast |    2.00-1.00    (  99.9[%]),    0.50-1.00    ( -50.0[%])
    m128                          blendps |    3.00-3.00    (   0.0[%]),    0.33-0.33    (  -0.0[%])
    m128                         blendvps |    2.77-2.40    (  15.4[%]),    0.36-0.42    ( -13.3[%])
    m128                         cvtps2dq |    2.00-2.00    (   0.0[%]),    0.50-0.50    (  -0.0[%])
    m128                            divpd |    0.25-0.25    (   0.0[%]),    4.00-4.00    (  -0.0[%])
    m128                            divps |    0.33-0.33    (   0.0[%]),    3.00-3.00    (  -0.0[%])
    m128                             dpps |    0.25-0.25    (   0.0[%]),    4.03-4.03    (  -0.0[%])
    m128                           haddps |    0.95-0.93    (   1.7[%]),    1.05-1.07    (  -1.6[%])
    m128                           loadps |    1.99-2.00    (  -0.6[%]),    0.50-0.50    (   0.6[%])
    m128                     movaps [mem] |    1.99-1.99    (  -0.4[%]),    0.50-0.50    (   0.4[%])
    m128                   movdqu [mem+1] |    1.99-2.00    (  -0.6[%]),    0.50-0.50    (   0.6[%])
    m128  movdqu [mem+2MB-1] (cross page) |    0.30-0.23    (  33.7[%]),    3.31-4.42    ( -25.2[%])
    m128    movdqu [mem+63] (cross cache) |    0.94-1.00    (  -6.0[%]),    1.06-1.00    (   6.4[%])
    m128                       movq->movq |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
    m128                            mulps |    2.00-2.00    (  -0.0[%]),    0.50-0.50    (   0.0[%])
    m128                             padd |    3.00-3.00    (   0.0[%]),    0.33-0.33    (  -0.0[%])
    m128                        pclmulqdq |    1.00-0.50    ( 100.0[%]),    1.00-2.00    ( -50.0[%])
    m128                        pcmpestri |    0.25-0.25    (  -0.0[%]),    4.03-4.03    (   0.0[%])
    m128                        pcmpestrm |    0.20-0.20    (   0.0[%]),    5.03-5.03    (  -0.0[%])
    m128                        pcmpistri |    0.33-0.33    (   0.0[%]),    3.00-3.00    (  -0.0[%])
    m128                        pcmpistrm |    0.33-0.33    (  -0.0[%]),    3.00-3.00    (   0.0[%])
    m128                           phaddd |    0.95-0.93    (   1.7[%]),    1.05-1.07    (  -1.6[%])
    m128                           pinsrd |    0.95-0.92    (   2.9[%]),    1.05-1.08    (  -2.8[%])
    m128                         pmovmskb |    1.00-1.00    (  -0.0[%]),    1.00-1.00    (   0.0[%])
    m128                           pmuldq |    2.00-2.00    (   0.1[%]),    0.50-0.50    (  -0.1[%])
    m128                           pmullw |    2.00-2.00    (   0.1[%]),    0.50-0.50    (  -0.1[%])
    m128                           pshufb |    2.00-2.00    (   0.0[%]),    0.50-0.50    (  -0.0[%])
    m128                             pxor |    5.54-4.50    (  23.1[%]),    0.18-0.22    ( -18.7[%])
    m128                            rcpps |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
    m128                          rsqrtps |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
    m128                           shufps |    2.00-2.00    (   0.0[%]),    0.50-0.50    (  -0.0[%])
    m128                           vfmapd |    2.00-2.00    (  -0.0[%]),    0.50-0.50    (   0.0[%])
    m128                           vfmaps |    2.00-1.99    (   0.1[%]),    0.50-0.50    (  -0.1[%])
    m128                            xorps |    5.84-4.50    (  29.7[%]),    0.17-0.22    ( -22.9[%])
    m256      gather32(<ld+ins>x8 + perm) |    0.29-0.25    (  14.3[%]),    3.50-4.00    ( -12.5[%])
    m256      gather64(<ld+ins>x4 + perm) |    0.67-0.50    (  33.3[%]),    1.50-2.00    ( -25.0[%])
    m256                     movaps [mem] |    1.99-2.00    (  -0.7[%]),    0.50-0.50    (   0.7[%])
    m256                           vaddps |    2.00-2.00    (   0.0[%]),    0.50-0.50    (  -0.0[%])
    m256                           vdivpd |    0.12-0.12    (   0.0[%]),    8.00-8.00    (  -0.0[%])
    m256                           vdivps |    0.20-0.20    (   0.0[%]),    5.00-5.00    (  -0.0[%])
    m256                           vfmapd |    2.00-2.00    (   0.0[%]),    0.50-0.50    (   0.0[%])
    m256                           vfmaps |    2.00-2.00    (   0.0[%]),    0.50-0.50    (  -0.0[%])
    m256                       vgatherdpd |    0.50-0.33    (  49.9[%]),    2.00-3.00    ( -33.3[%])
    m256                  vmovdqu [mem+1] |    1.99-2.00    (  -0.6[%]),    0.50-0.50    (   0.6[%])
    m256 vmovdqu [mem+2MB-1] (cross page) |    0.30-0.23    (  32.9[%]),    3.31-4.40    ( -24.7[%])
    m256   vmovdqu [mem+63] (cross cache) |    0.94-1.00    (  -6.0[%]),    1.06-1.00    (   6.4[%])
    m256                           vmulps |    2.00-2.00    (   0.0[%]),    0.50-0.50    (  -0.0[%])
    m256                           vpaddd |    3.00-3.00    (  -0.0[%]),    0.33-0.33    (   0.0[%])
    m256                        vpblendvb |    0.97-1.00    (  -3.2[%]),    1.03-1.00    (   3.3[%])
    m256                         vpdpwssd |     N/A-2.00    (   N/A[%]),     N/A-0.50    (   N/A[%])
    m256                        vpdpwssds |     N/A-2.00    (   N/A[%]),     N/A-0.50    (   N/A[%])
    m256                       vperm2f128 |    1.00-1.00    (  -0.0[%]),    1.00-1.00    (   0.0[%])
    m256                          vpermpd |    1.00-1.00    (  -0.0[%]),    1.00-1.00    (   0.0[%])
    m256                          vpermps |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
    m256                       vpgatherdd |    0.33-0.20    (  64.9[%]),    3.03-5.00    ( -39.4[%])
    m256                        vpmovmskb |    1.00-1.00    (  -0.0[%]),    1.00-1.00    (   0.0[%])
    m256                        vpmovsxwd |    1.00-1.00    (  -0.0[%]),    1.00-1.00    (   0.0[%])
    m256                          vpshufb |    2.00-2.00    (  -0.0[%]),    0.50-0.50    (   0.0[%])
    m256                            vpxor |    5.53-4.48    (  23.5[%]),    0.18-0.22    ( -19.0[%])
    m256                           vrcpps |    1.00-1.00    (   0.1[%]),    1.00-1.00    (  -0.1[%])
    m256                         vrsqrtps |    1.00-1.00    (   0.2[%]),    1.00-1.00    (  -0.2[%])
    m256                          vsqrtps |    0.17-0.17    (   0.0[%]),    6.00-6.00    (  -0.0[%])
    m256                           vxorps |    5.54-4.50    (  23.1[%]),    0.18-0.22    ( -18.7[%])
    m512                           vaddpd |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                           vaddps |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                           vfmapd |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                           vfmaps |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512           vfmaps reg, reg, [mem] |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                            vorpd |     N/A-2.00    (   N/A[%]),     N/A-0.50    (   N/A[%])
    m512                            vorps |     N/A-2.00    (   N/A[%]),     N/A-0.50    (   N/A[%])
    m512            vorps reg, reg, [mem] |     N/A-2.00    (   N/A[%]),     N/A-0.50    (   N/A[%])
    m512                      vpconflictd |     N/A-0.05    (   N/A[%]),     N/A-19.40   (   N/A[%])
    m512                         vpdpwssd |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                        vpdpwssds |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                         vpermt2d |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
    m512                        vpexpandd |     N/A-0.50    (   N/A[%]),     N/A-2.00    (   N/A[%])
    m512                         vplzcntq |     N/A-0.50    (   N/A[%]),     N/A-2.00    (   N/A[%])
    m512                       vpternlogd |     N/A-2.00    (   N/A[%]),     N/A-0.50    (   N/A[%])
    m512                         vrcp14pd |     N/A-0.50    (   N/A[%]),     N/A-2.00    (   N/A[%])
    m512                          vshufps |     N/A-1.00    (   N/A[%]),     N/A-1.00    (   N/A[%])
   reg64                              add |    4.71-3.91    (  20.3[%]),    0.21-0.26    ( -16.9[%])
   reg64                            crc32 |    1.00-1.00    (   0.0[%]),    1.00-1.00    (  -0.0[%])
   reg64                              lea |    5.82-3.91    (  48.8[%]),    0.17-0.26    ( -32.8[%])
   reg64                             load |    1.60-1.60    (  -0.0[%]),    0.63-0.63    (   0.0[%])
   reg64                           popcnt |    1.00-1.00    (   0.1[%]),    1.00-1.00    (  -0.1[%])
   reg64       store [mem+0]->load[mem+0] |    0.81-0.80    (   2.2[%]),    1.23-1.25    (  -2.2[%])
   reg64       store [mem+0]->load[mem+1] |    0.07-0.06    (  13.3[%]),   15.00-17.00   ( -11.8[%])
   reg64                              xor |    5.82-4.92    (  18.2[%]),    0.17-0.20    ( -15.4[%])
   reg64                      xor dst,dst |    5.82-4.92    (  18.2[%]),    0.17-0.20    ( -15.4[%])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment