Skip to content

Instantly share code, notes, and snippets.

@nkurz
Created July 12, 2016 03:37
Show Gist options
  • Save nkurz/5e389a29cd1eabaae67924b28b40e7fd to your computer and use it in GitHub Desktop.
Save nkurz/5e389a29cd1eabaae67924b28b40e7fd to your computer and use it in GitHub Desktop.
Differences in macro- and micro-fusion performance: Skylake vs. Haswell
// gcc -g -Wall -O2 fusion.c -o fusion -DLIKWID -llikwid [may also need -lm -lpthread]
// likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>
// Measurement harness.
//
// MEASURE_INIT()      - one-time setup; calls the LIKWID marker init functions
//                       when built with -DLIKWID, otherwise expands to nothing.
// MEASURE_FINI()      - one-time teardown; closes the LIKWID marker API when
//                       built with -DLIKWID, otherwise expands to nothing.
// MEASURE(name, code) - zeroes the caller's `sum1` and `sum2` variables, runs
//                       `code` (inside a LIKWID marker region called `name`
//                       when built with -DLIKWID), then prints both sums.
//
// MEASURE expects variables named `sum1` and `sum2` to be in scope at the call
// site. They are printed with PRIu64, so they are converted to uint64_t for
// the printf call instead of being passed to a signed "%ld" conversion.
#ifdef LIKWID
#include <likwid.h>
#define MEASURE_INIT() \
    do { \
        likwid_markerInit(); \
        likwid_markerThreadInit(); \
    } while (0)
#define MEASURE_FINI() \
    do { \
        likwid_markerClose(); \
    } while (0)
#define MEASURE_REGION_BEGIN(name) likwid_markerStartRegion(name)
#define MEASURE_REGION_END(name) likwid_markerStopRegion(name)
#else // not LIKWID
#define MEASURE_INIT()
#define MEASURE_FINI()
#define MEASURE_REGION_BEGIN(name) ((void)0)
#define MEASURE_REGION_END(name) ((void)0)
#endif // not LIKWID
// Single definition shared by both build flavours so the output format cannot
// drift between them.
#define MEASURE(name, code) \
    do { \
        sum1 = sum2 = 0; \
        MEASURE_REGION_BEGIN(name); \
        code; \
        MEASURE_REGION_END(name); \
        printf("%s: sum1=%" PRIu64 ", sum2=%" PRIu64 "\n", name, \
               (uint64_t)(sum1), (uint64_t)(sum2)); \
    } while (0)
// Loop variant: both additions use a memory source operand (`add (mem), reg`)
// and both `cmp` instructions are immediately followed by their conditional
// jump.
//
//   in1, in2   - pointers to the 64-bit addends; dereferenced every iteration
//   sum1, sum2 - lvalues holding the running sums (read and written)
//   max        - loop bound
//
// Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max;
// otherwise it adds *in2 to sum2 and repeats while sum2 < max (unsigned
// comparisons).
//
// The "memory" clobber tells the compiler that the asm reads the objects
// behind in1/in2, which are passed only as addresses in registers; "cc"
// records that the flags are modified.
#define ASM_TWO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max) \
    __asm volatile ("1:\n" \
                    "add (%[IN1]), %[SUM1]\n" \
                    "cmp %[MAX], %[SUM1]\n" \
                    "jae 2f\n" \
                    "add (%[IN2]), %[SUM2]\n" \
                    "cmp %[MAX], %[SUM2]\n" \
                    "jb 1b\n" \
                    "2:" : \
                    [SUM1] "+&r" (sum1), \
                    [SUM2] "+&r" (sum2) : \
                    [IN1] "r" (in1), \
                    [IN2] "r" (in2), \
                    [MAX] "r" (max) : \
                    "memory", "cc")
// Loop variant: each addend is first loaded into a scratch register with a
// separate `mov` and then added register-to-register; both `cmp` instructions
// are immediately followed by their conditional jump.
//
//   in1, in2   - pointers to the 64-bit addends; dereferenced every iteration
//   sum1, sum2 - lvalues holding the running sums (read and written)
//   max        - loop bound
//   tmp1, tmp2 - scratch lvalues; overwritten with the loaded addends
//
// Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max;
// otherwise it adds *in2 to sum2 and repeats while sum2 < max (unsigned
// comparisons).
//
// The "memory" clobber tells the compiler that the asm reads the objects
// behind in1/in2, which are passed only as addresses in registers; "cc"
// records that the flags are modified.
#define ASM_NO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2) \
    __asm volatile ("1:\n" \
                    "mov (%[IN1]), %[TMP1]\n" \
                    "add %[TMP1], %[SUM1]\n" \
                    "cmp %[MAX], %[SUM1]\n" \
                    "jae 2f\n" \
                    "mov (%[IN2]), %[TMP2]\n" \
                    "add %[TMP2], %[SUM2]\n" \
                    "cmp %[MAX], %[SUM2]\n" \
                    "jb 1b\n" \
                    "2:" : \
                    [TMP1] "=&r" (tmp1), \
                    [TMP2] "=&r" (tmp2), \
                    [SUM1] "+&r" (sum1), \
                    [SUM2] "+&r" (sum2) : \
                    [IN1] "r" (in1), \
                    [IN2] "r" (in2), \
                    [MAX] "r" (max) : \
                    "memory", "cc")
// Loop variant: the first addition uses a memory source operand
// (`add (mem), reg`), the second is split into a `mov` load plus a
// register-to-register `add`; both `cmp` instructions are immediately
// followed by their conditional jump.
//
//   in1, in2   - pointers to the 64-bit addends; dereferenced every iteration
//   sum1, sum2 - lvalues holding the running sums (read and written)
//   max        - loop bound
//   tmp        - scratch lvalue; overwritten with the value loaded from *in2
//
// Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max;
// otherwise it adds *in2 to sum2 and repeats while sum2 < max (unsigned
// comparisons).
//
// The "memory" clobber tells the compiler that the asm reads the objects
// behind in1/in2, which are passed only as addresses in registers; "cc"
// records that the flags are modified.
#define ASM_ONE_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp) \
    __asm volatile ("1:\n" \
                    "add (%[IN1]), %[SUM1]\n" \
                    "cmp %[MAX], %[SUM1]\n" \
                    "jae 2f\n" \
                    "mov (%[IN2]), %[TMP]\n" \
                    "add %[TMP], %[SUM2]\n" \
                    "cmp %[MAX], %[SUM2]\n" \
                    "jb 1b\n" \
                    "2:" : \
                    [TMP] "=&r" (tmp), \
                    [SUM1] "+&r" (sum1), \
                    [SUM2] "+&r" (sum2) : \
                    [IN1] "r" (in1), \
                    [IN2] "r" (in2), \
                    [MAX] "r" (max) : \
                    "memory", "cc")
// Loop variant: the first addition uses a memory source operand
// (`add (mem), reg`), the second is split into a `mov` load plus a
// register-to-register `add`. The `mov` load is placed between the first
// `cmp` and its `jae`, so only the second `cmp` is directly followed by its
// conditional jump. (`mov` does not modify the flags, so the branch still
// tests the result of the preceding `cmp`.)
//
//   in1, in2   - pointers to the 64-bit addends; dereferenced every iteration
//   sum1, sum2 - lvalues holding the running sums (read and written)
//   max        - loop bound
//   tmp        - scratch lvalue; overwritten with the value loaded from *in2
//
// Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max;
// otherwise it adds *in2 to sum2 and repeats while sum2 < max (unsigned
// comparisons).
//
// The value added to sum2 is loaded from IN2, matching the other variants;
// loading it from IN1 would leave the in2 argument unused and give a
// different sum2 whenever *in1 != *in2.
//
// The "memory" clobber tells the compiler that the asm reads the objects
// behind in1/in2, which are passed only as addresses in registers; "cc"
// records that the flags are modified.
#define ASM_ONE_MICRO_ONE_MACRO(in1, sum1, in2, sum2, max, tmp) \
    __asm volatile ("1:\n" \
                    "add (%[IN1]), %[SUM1]\n" \
                    "cmp %[MAX], %[SUM1]\n" \
                    "mov (%[IN2]), %[TMP]\n" \
                    "jae 2f\n" \
                    "add %[TMP], %[SUM2]\n" \
                    "cmp %[MAX], %[SUM2]\n" \
                    "jb 1b\n" \
                    "2:" : \
                    [TMP] "=&r" (tmp), \
                    [SUM1] "+&r" (sum1), \
                    [SUM2] "+&r" (sum2) : \
                    [IN1] "r" (in1), \
                    [IN2] "r" (in2), \
                    [MAX] "r" (max) : \
                    "memory", "cc")
// Loop variant: two separate loads and adds, two non-fused cmp then jcc.
//
// Both addends are loaded with a separate `mov` and added
// register-to-register, and each `mov` load is placed between a `cmp` and its
// conditional jump. (`mov` does not modify the flags, so each branch still
// tests the result of the preceding `cmp`.) The load of *in1 for the first
// iteration is done once before the loop label; later iterations reload it
// just before the backward branch.
//
//   in1, in2   - pointers to the 64-bit addends; dereferenced every iteration
//   sum1, sum2 - lvalues holding the running sums (read and written)
//   max        - loop bound
//   tmp1, tmp2 - scratch lvalues; overwritten with the loaded addends
//
// Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max;
// otherwise it adds *in2 to sum2 and repeats while sum2 < max (unsigned
// comparisons).
//
// The "memory" clobber tells the compiler that the asm reads the objects
// behind in1/in2, which are passed only as addresses in registers; "cc"
// records that the flags are modified.
#define ASM_NO_MICRO_NO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2) \
    __asm volatile ("mov (%[IN1]), %[TMP1]\n" \
                    "1:\n" \
                    "add %[TMP1], %[SUM1]\n" \
                    "cmp %[MAX], %[SUM1]\n" \
                    "mov (%[IN2]), %[TMP2]\n" \
                    "jae 2f\n" \
                    "add %[TMP2], %[SUM2]\n" \
                    "cmp %[MAX], %[SUM2]\n" \
                    "mov (%[IN1]), %[TMP1]\n" \
                    "jb 1b\n" \
                    "2:" : \
                    [TMP1] "=&r" (tmp1), \
                    [TMP2] "=&r" (tmp2), \
                    [SUM1] "+&r" (sum1), \
                    [SUM2] "+&r" (sum2) : \
                    [IN1] "r" (in1), \
                    [IN2] "r" (in2), \
                    [MAX] "r" (max) : \
                    "memory", "cc")
// Runs every loop variant once through MEASURE, which zeroes sum1/sum2 before
// each variant and prints them afterwards. Takes no command-line arguments.
// Always returns 0.
int main(void) {
    uint64_t tmp, tmp1, tmp2;   // scratch outputs written by the asm variants
    uint64_t sum1, sum2;        // assigned by MEASURE before each variant runs
    uint64_t in1 = 1;           // addend read through &in1 by the asm loops
    uint64_t in2 = 1;           // addend read through &in2 by the asm loops
    uint64_t max = 10000000;    // loop bound compared against sum1 and sum2

    MEASURE_INIT();
    MEASURE("two_micro_two_macro", ASM_TWO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max));
    MEASURE("one_micro_two_macro", ASM_ONE_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp));
    MEASURE("one_micro_one_macro", ASM_ONE_MICRO_ONE_MACRO(&in1, sum1, &in2, sum2, max, tmp));
    MEASURE("no_micro_two_macro", ASM_NO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));
    MEASURE("no_micro_no_macro", ASM_NO_MICRO_NO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));
    MEASURE_FINI();
    return 0;
}
nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
-------------------------------------------------------------
-------------------------------------------------------------
CPU type: Intel Core Haswell processor
CPU clock: 3.39 GHz
-------------------------------------------------------------
fusion
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
no_micro_two_macro: sum1=10000000, sum2=9999999
no_micro_no_macro: sum1=10000000, sum2=9999999
=====================
Region: two_micro_two_macro
=====================
| UOPS_ISSUED_ANY | 4.00061e+07 |
| UOPS_EXECUTED_CORE | 6.00062e+07 |
| UOPS_RETIRED_ALL | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
| INSTR_RETIRED_ANY | 6.00013e+07 |
| CPU_CLK_UNHALTED_CORE | 1.7392e+07 |
=====================
Region: one_micro_two_macro
=====================
+----------------------------+-------------+
| Event | core 1 |
+----------------------------+-------------+
| UOPS_ISSUED_ANY | 5.00062e+07 |
| UOPS_EXECUTED_CORE | 6.00062e+07 |
| UOPS_RETIRED_ALL | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
| INSTR_RETIRED_ANY | 7.00013e+07 |
| CPU_CLK_UNHALTED_CORE | 1.4247e+07 |
=====================
Region: one_micro_one_macro
=====================
+----------------------------+-------------+
| Event | core 1 |
+----------------------------+-------------+
| UOPS_ISSUED_ANY | 6.00065e+07 |
| UOPS_EXECUTED_CORE | 7.00065e+07 |
| UOPS_RETIRED_ALL | 7.00048e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
| INSTR_RETIRED_ANY | 7.00013e+07 |
| CPU_CLK_UNHALTED_CORE | 1.69403e+07 |
=====================
Region: no_micro_two_macro
=====================
+----------------------------+-------------+
| Event | core 1 |
+----------------------------+-------------+
| UOPS_ISSUED_ANY | 6.00062e+07 |
| UOPS_EXECUTED_CORE | 6.00062e+07 |
| UOPS_RETIRED_ALL | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
| INSTR_RETIRED_ANY | 8.00013e+07 |
| CPU_CLK_UNHALTED_CORE | 1.57365e+07 |
=====================
Region: no_micro_no_macro
=====================
| UOPS_ISSUED_ANY | 8.00062e+07 |
| UOPS_EXECUTED_CORE | 8.00062e+07 |
| UOPS_RETIRED_ALL | 8.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
| INSTR_RETIRED_ANY | 8.00013e+07 |
| CPU_CLK_UNHALTED_CORE | 2.0043e+07 |
+----------------------------+-------------+
nate@skylake:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
CPU name: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
CPU type: Intel Skylake processor
CPU clock: 3.41 GHz
--------------------------------------------------------------------------------
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
no_micro_two_macro: sum1=10000000, sum2=9999999
no_micro_no_macro: sum1=10000000, sum2=9999999
--------------------------------------------------------------------------------
================================================================================
Group 1 Custom: Region two_micro_two_macro
================================================================================
| UOPS_ISSUED_ANY | PMC0 | 4.000816e+07 |
| UOPS_EXECUTED_CORE | PMC1 | 6.000806e+07 |
| UOPS_RETIRED_ALL | PMC2 | 6.000724e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000056e+07 |
| INSTR_RETIRED_ANY | FIXC0 | 6.000540e+07 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.001363e+07 |
================================================================================
Group 1 Custom: Region one_micro_two_macro
================================================================================
| UOPS_ISSUED_ANY | PMC0 | 5.000502e+07 |
| UOPS_EXECUTED_CORE | PMC1 | 6.000506e+07 |
| UOPS_RETIRED_ALL | PMC2 | 6.000471e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000040e+07 |
| INSTR_RETIRED_ANY | FIXC0 | 7.000316e+07 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.334216e+07 |
================================================================================
Group 1 Custom: Region one_micro_one_macro
================================================================================
| UOPS_ISSUED_ANY | PMC0 | 6.000435e+07 |
| UOPS_EXECUTED_CORE | PMC1 | 7.000444e+07 |
| UOPS_RETIRED_ALL | PMC2 | 7.000445e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000039e+07 |
| INSTR_RETIRED_ANY | FIXC0 | 7.000310e+07 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.672351e+07 |
================================================================================
Group 1 Custom: Region no_micro_two_macro
================================================================================
| UOPS_ISSUED_ANY | PMC0 | 6.000429e+07 |
| UOPS_EXECUTED_CORE | PMC1 | 6.000438e+07 |
| UOPS_RETIRED_ALL | PMC2 | 6.000438e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000039e+07 |
| INSTR_RETIRED_ANY | FIXC0 | 8.000307e+07 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.500636e+07 |
================================================================================
Group 1 Custom: Region no_micro_no_macro
================================================================================
| UOPS_ISSUED_ANY | PMC0 | 8.000476e+07 |
| UOPS_EXECUTED_CORE | PMC1 | 8.000483e+07 |
| UOPS_RETIRED_ALL | PMC2 | 8.000466e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000039e+07 |
| INSTR_RETIRED_ANY | FIXC0 | 8.000312e+07 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 2.000775e+07 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment