Created
July 12, 2016 03:37
-
-
Save nkurz/5e389a29cd1eabaae67924b28b40e7fd to your computer and use it in GitHub Desktop.
Differences in macro- and micro-fusion performance Skylake vs Haswell
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Build: gcc -g -Wall -O2 fusion.c -o fusion -DLIKWID -llikwid   (may also need -lm -lpthread)
// Run:   likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
#include <x86intrin.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
/*
 * Measurement harness.
 *
 * MEASURE_INIT()      - set up the measurement backend (no-op without LIKWID).
 * MEASURE_FINI()      - tear down the measurement backend (no-op without LIKWID).
 * MEASURE(name, code) - zero sum1/sum2, run `code`, then print both sums.
 *                       When built with -DLIKWID, `code` is bracketed by a
 *                       LIKWID marker region called `name`.
 *
 * MEASURE expects two variables named `sum1` and `sum2` to be in scope at the
 * call site; it assigns to them and prints them.
 */
#ifdef LIKWID
#include <likwid.h>
#define MEASURE_INIT()                                                      \
    do {                                                                    \
        likwid_markerInit();                                                \
        likwid_markerThreadInit();                                          \
    } while (0)
#define MEASURE_FINI()                                                      \
    do {                                                                    \
        likwid_markerClose();                                               \
    } while (0)
#define MEASURE_REGION_START(name) likwid_markerStartRegion(name)
#define MEASURE_REGION_STOP(name)  likwid_markerStopRegion(name)
#else // not LIKWID
#define MEASURE_INIT()
#define MEASURE_FINI()
#define MEASURE_REGION_START(name) ((void)0)
#define MEASURE_REGION_STOP(name)  ((void)0)
#endif // not LIKWID

/*
 * The sums are cast to unsigned long long and printed with %llu so the
 * conversion specifier always matches the argument type, whatever the
 * underlying type of the caller's sum variables is.
 */
#define MEASURE(name, code)                                                 \
    do {                                                                    \
        sum1 = sum2 = 0;                                                    \
        MEASURE_REGION_START(name);                                         \
        code;                                                               \
        MEASURE_REGION_STOP(name);                                          \
        printf("%s: sum1=%llu, sum2=%llu\n", (name),                        \
               (unsigned long long)sum1, (unsigned long long)sum2);         \
    } while (0)
/*
 * ASM_TWO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max)
 *
 * x86 inline-asm loop (AT&T syntax). Each iteration:
 *   sum1 += *in1;  exit if sum1 >= max (unsigned);
 *   sum2 += *in2;  repeat while sum2 <  max (unsigned).
 *
 * Both additions use a memory source operand, and each cmp is immediately
 * followed by its conditional jump. As the name indicates, this is the
 * variant meant to offer two micro-fusion and two macro-fusion candidates.
 *
 *   in1, in2   - pointers to the addends (read every iteration)
 *   sum1, sum2 - lvalues, read and updated in place
 *   max        - loop bound
 *
 * The "memory" clobber is required because the asm dereferences in1/in2,
 * which are passed only as register operands; without it the compiler need
 * not have written the pointed-to objects to memory. "cc" documents that
 * the flags are modified.
 */
#define ASM_TWO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max)                  \
    __asm volatile ("1:\n"                                                  \
                    "add (%[IN1]), %[SUM1]\n"                               \
                    "cmp %[MAX], %[SUM1]\n"                                 \
                    "jae 2f\n"                                              \
                    "add (%[IN2]), %[SUM2]\n"                               \
                    "cmp %[MAX], %[SUM2]\n"                                 \
                    "jb 1b\n"                                               \
                    "2:"                                                    \
                    : [SUM1] "+&r" (sum1),                                  \
                      [SUM2] "+&r" (sum2)                                   \
                    : [IN1] "r" (in1),                                      \
                      [IN2] "r" (in2),                                      \
                      [MAX] "r" (max)                                       \
                    : "cc", "memory")
/*
 * ASM_NO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2)
 *
 * Same loop as the memory-operand variant, but each addend is first loaded
 * into a scratch register with a separate mov and then added
 * register-to-register. Each cmp is still immediately followed by its
 * conditional jump. Per iteration:
 *   tmp1 = *in1; sum1 += tmp1;  exit if sum1 >= max (unsigned);
 *   tmp2 = *in2; sum2 += tmp2;  repeat while sum2 <  max (unsigned).
 *
 *   in1, in2   - pointers to the addends (read every iteration)
 *   sum1, sum2 - lvalues, read and updated in place
 *   max        - loop bound
 *   tmp1, tmp2 - scratch lvalues; overwritten, prior value not read
 *
 * "memory" is clobbered because the asm reads through in1/in2, which the
 * compiler only sees as register inputs; "cc" documents the flag writes.
 */
#define ASM_NO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2)       \
    __asm volatile ("1:\n"                                                  \
                    "mov (%[IN1]), %[TMP1]\n"                               \
                    "add %[TMP1], %[SUM1]\n"                                \
                    "cmp %[MAX], %[SUM1]\n"                                 \
                    "jae 2f\n"                                              \
                    "mov (%[IN2]), %[TMP2]\n"                               \
                    "add %[TMP2], %[SUM2]\n"                                \
                    "cmp %[MAX], %[SUM2]\n"                                 \
                    "jb 1b\n"                                               \
                    "2:"                                                    \
                    : [TMP1] "=&r" (tmp1),                                  \
                      [TMP2] "=&r" (tmp2),                                  \
                      [SUM1] "+&r" (sum1),                                  \
                      [SUM2] "+&r" (sum2)                                   \
                    : [IN1] "r" (in1),                                      \
                      [IN2] "r" (in2),                                      \
                      [MAX] "r" (max)                                       \
                    : "cc", "memory")
/*
 * ASM_ONE_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp)
 *
 * Mixed variant: the first addition uses a memory source operand, the
 * second goes through a separate mov into a scratch register. Each cmp is
 * immediately followed by its conditional jump. Per iteration:
 *   sum1 += *in1;              exit if sum1 >= max (unsigned);
 *   tmp = *in2; sum2 += tmp;   repeat while sum2 <  max (unsigned).
 *
 *   in1, in2   - pointers to the addends (read every iteration)
 *   sum1, sum2 - lvalues, read and updated in place
 *   max        - loop bound
 *   tmp        - scratch lvalue; overwritten, prior value not read
 *
 * "memory" is clobbered because the asm reads through in1/in2, which the
 * compiler only sees as register inputs; "cc" documents the flag writes.
 */
#define ASM_ONE_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp)             \
    __asm volatile ("1:\n"                                                  \
                    "add (%[IN1]), %[SUM1]\n"                               \
                    "cmp %[MAX], %[SUM1]\n"                                 \
                    "jae 2f\n"                                              \
                    "mov (%[IN2]), %[TMP]\n"                                \
                    "add %[TMP], %[SUM2]\n"                                 \
                    "cmp %[MAX], %[SUM2]\n"                                 \
                    "jb 1b\n"                                               \
                    "2:"                                                    \
                    : [TMP] "=&r" (tmp),                                    \
                      [SUM1] "+&r" (sum1),                                  \
                      [SUM2] "+&r" (sum2)                                   \
                    : [IN1] "r" (in1),                                      \
                      [IN2] "r" (in2),                                      \
                      [MAX] "r" (max)                                       \
                    : "cc", "memory")
/*
 * ASM_ONE_MICRO_ONE_MACRO(in1, sum1, in2, sum2, max, tmp)
 *
 * Variant in which the load for the second addition is placed between the
 * first cmp and its jae, so that pair is no longer adjacent; the second
 * cmp/jb pair remains adjacent. Per iteration:
 *   sum1 += *in1;  tmp = *in2;  exit if sum1 >= max (unsigned);
 *   sum2 += tmp;                repeat while sum2 <  max (unsigned).
 * The interposed mov does not modify the flags, so jae still tests the
 * result of the preceding cmp.
 *
 *   in1, in2   - pointers to the addends (read every iteration)
 *   sum1, sum2 - lvalues, read and updated in place
 *   max        - loop bound
 *   tmp        - scratch lvalue; overwritten, prior value not read
 *
 * The second addend is loaded from in2, matching the other loop variants
 * (sum2 is always fed from in2). "memory" is clobbered because the asm
 * reads through in1/in2, which the compiler only sees as register inputs;
 * "cc" documents the flag writes.
 */
#define ASM_ONE_MICRO_ONE_MACRO(in1, sum1, in2, sum2, max, tmp)             \
    __asm volatile ("1:\n"                                                  \
                    "add (%[IN1]), %[SUM1]\n"                               \
                    "cmp %[MAX], %[SUM1]\n"                                 \
                    "mov (%[IN2]), %[TMP]\n"                                \
                    "jae 2f\n"                                              \
                    "add %[TMP], %[SUM2]\n"                                 \
                    "cmp %[MAX], %[SUM2]\n"                                 \
                    "jb 1b\n"                                               \
                    "2:"                                                    \
                    : [TMP] "=&r" (tmp),                                    \
                      [SUM1] "+&r" (sum1),                                  \
                      [SUM2] "+&r" (sum2)                                   \
                    : [IN1] "r" (in1),                                      \
                      [IN2] "r" (in2),                                      \
                      [MAX] "r" (max)                                       \
                    : "cc", "memory")
/*
 * ASM_NO_MICRO_NO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2)
 *
 * Two separate loads and adds, two non-fused cmp then jcc: every addend is
 * loaded by its own mov, and a mov sits between each cmp and its
 * conditional jump. The load of *in1 is hoisted: it runs once before the
 * loop and again at the bottom of each iteration, for the next one.
 * Per iteration:
 *   sum1 += tmp1;  tmp2 = *in2;  exit if sum1 >= max (unsigned);
 *   sum2 += tmp2;  tmp1 = *in1;  repeat while sum2 <  max (unsigned).
 * The interposed movs do not modify the flags, so each jump still tests
 * the result of the preceding cmp.
 *
 *   in1, in2   - pointers to the addends (read every iteration)
 *   sum1, sum2 - lvalues, read and updated in place
 *   max        - loop bound
 *   tmp1, tmp2 - scratch lvalues; overwritten, prior value not read
 *
 * "memory" is clobbered because the asm reads through in1/in2, which the
 * compiler only sees as register inputs; "cc" documents the flag writes.
 */
#define ASM_NO_MICRO_NO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2)        \
    __asm volatile ("mov (%[IN1]), %[TMP1]\n"                               \
                    "1:\n"                                                  \
                    "add %[TMP1], %[SUM1]\n"                                \
                    "cmp %[MAX], %[SUM1]\n"                                 \
                    "mov (%[IN2]), %[TMP2]\n"                               \
                    "jae 2f\n"                                              \
                    "add %[TMP2], %[SUM2]\n"                                \
                    "cmp %[MAX], %[SUM2]\n"                                 \
                    "mov (%[IN1]), %[TMP1]\n"                               \
                    "jb 1b\n"                                               \
                    "2:"                                                    \
                    : [TMP1] "=&r" (tmp1),                                  \
                      [TMP2] "=&r" (tmp2),                                  \
                      [SUM1] "+&r" (sum1),                                  \
                      [SUM2] "+&r" (sum2)                                   \
                    : [IN1] "r" (in1),                                      \
                      [IN2] "r" (in2),                                      \
                      [MAX] "r" (max)                                       \
                    : "cc", "memory")
/*
 * Runs each loop variant once under MEASURE, which zeroes sum1/sum2 before
 * the run and prints them afterwards. Always returns 0.
 */
int main(void) {
    uint64_t tmp, tmp1, tmp2;   /* scratch outputs for the asm loops */
    uint64_t sum1, sum2;        /* zeroed by MEASURE before every run */
    uint64_t in1 = 1;           /* addend for sum1, read through &in1 */
    uint64_t in2 = 1;           /* addend for sum2, read through &in2 */
    uint64_t max = 10000000;    /* loop bound passed to every variant */

    MEASURE_INIT();

    MEASURE("two_micro_two_macro",
            ASM_TWO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max));
    MEASURE("one_micro_two_macro",
            ASM_ONE_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp));
    MEASURE("one_micro_one_macro",
            ASM_ONE_MICRO_ONE_MACRO(&in1, sum1, &in2, sum2, max, tmp));
    MEASURE("no_micro_two_macro",
            ASM_NO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));
    MEASURE("no_micro_no_macro",
            ASM_NO_MICRO_NO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));

    MEASURE_FINI();
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion | |
------------------------------------------------------------- | |
------------------------------------------------------------- | |
CPU type: Intel Core Haswell processor | |
CPU clock: 3.39 GHz | |
------------------------------------------------------------- | |
fusion | |
two_micro_two_macro: sum1=10000000, sum2=9999999 | |
one_micro_two_macro: sum1=10000000, sum2=9999999 | |
one_micro_one_macro: sum1=10000000, sum2=9999999 | |
no_micro_two_macro: sum1=10000000, sum2=9999999 | |
no_micro_no_macro: sum1=10000000, sum2=9999999 | |
===================== | |
Region: two_micro_two_macro | |
===================== | |
| UOPS_ISSUED_ANY | 4.00061e+07 | | |
| UOPS_EXECUTED_CORE | 6.00062e+07 | | |
| UOPS_RETIRED_ALL | 6.00046e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 | | |
| INSTR_RETIRED_ANY | 6.00013e+07 | | |
| CPU_CLK_UNHALTED_CORE | 1.7392e+07 | | |
===================== | |
Region: one_micro_two_macro | |
===================== | |
+----------------------------+-------------+ | |
| Event | core 1 | | |
+----------------------------+-------------+ | |
| UOPS_ISSUED_ANY | 5.00062e+07 | | |
| UOPS_EXECUTED_CORE | 6.00062e+07 | | |
| UOPS_RETIRED_ALL | 6.00046e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 | | |
| INSTR_RETIRED_ANY | 7.00013e+07 | | |
| CPU_CLK_UNHALTED_CORE | 1.4247e+07 | | |
===================== | |
Region: one_micro_one_macro | |
===================== | |
+----------------------------+-------------+ | |
| Event | core 1 | | |
+----------------------------+-------------+ | |
| UOPS_ISSUED_ANY | 6.00065e+07 | | |
| UOPS_EXECUTED_CORE | 7.00065e+07 | | |
| UOPS_RETIRED_ALL | 7.00048e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 | | |
| INSTR_RETIRED_ANY | 7.00013e+07 | | |
| CPU_CLK_UNHALTED_CORE | 1.69403e+07 | | |
===================== | |
Region: no_micro_two_macro | |
===================== | |
+----------------------------+-------------+ | |
| Event | core 1 | | |
+----------------------------+-------------+ | |
| UOPS_ISSUED_ANY | 6.00062e+07 | | |
| UOPS_EXECUTED_CORE | 6.00062e+07 | | |
| UOPS_RETIRED_ALL | 6.00046e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 | | |
| INSTR_RETIRED_ANY | 8.00013e+07 | | |
| CPU_CLK_UNHALTED_CORE | 1.57365e+07 | | |
===================== | |
Region: no_micro_no_macro | |
===================== | |
| UOPS_ISSUED_ANY | 8.00062e+07 | | |
| UOPS_EXECUTED_CORE | 8.00062e+07 | | |
| UOPS_RETIRED_ALL | 8.00046e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 | | |
| INSTR_RETIRED_ANY | 8.00013e+07 | | |
| CPU_CLK_UNHALTED_CORE | 2.0043e+07 | | |
+----------------------------+-------------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nate@skylake:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion | |
CPU name: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz | |
CPU type: Intel Skylake processor | |
CPU clock: 3.41 GHz | |
-------------------------------------------------------------------------------- | |
two_micro_two_macro: sum1=10000000, sum2=9999999 | |
one_micro_two_macro: sum1=10000000, sum2=9999999 | |
one_micro_one_macro: sum1=10000000, sum2=9999999 | |
no_micro_two_macro: sum1=10000000, sum2=9999999 | |
no_micro_no_macro: sum1=10000000, sum2=9999999 | |
-------------------------------------------------------------------------------- | |
================================================================================ | |
Group 1 Custom: Region two_micro_two_macro | |
================================================================================ | |
| UOPS_ISSUED_ANY | PMC0 | 4.000816e+07 | | |
| UOPS_EXECUTED_CORE | PMC1 | 6.000806e+07 | | |
| UOPS_RETIRED_ALL | PMC2 | 6.000724e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000056e+07 | | |
| INSTR_RETIRED_ANY | FIXC0 | 6.000540e+07 | | |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.001363e+07 | | |
================================================================================ | |
Group 1 Custom: Region one_micro_two_macro | |
================================================================================ | |
| UOPS_ISSUED_ANY | PMC0 | 5.000502e+07 | | |
| UOPS_EXECUTED_CORE | PMC1 | 6.000506e+07 | | |
| UOPS_RETIRED_ALL | PMC2 | 6.000471e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000040e+07 | | |
| INSTR_RETIRED_ANY | FIXC0 | 7.000316e+07 | | |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.334216e+07 | | |
================================================================================ | |
Group 1 Custom: Region one_micro_one_macro | |
================================================================================ | |
| UOPS_ISSUED_ANY | PMC0 | 6.000435e+07 | | |
| UOPS_EXECUTED_CORE | PMC1 | 7.000444e+07 | | |
| UOPS_RETIRED_ALL | PMC2 | 7.000445e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000039e+07 | | |
| INSTR_RETIRED_ANY | FIXC0 | 7.000310e+07 | | |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.672351e+07 | | |
================================================================================ | |
Group 1 Custom: Region no_micro_two_macro | |
================================================================================ | |
| UOPS_ISSUED_ANY | PMC0 | 6.000429e+07 | | |
| UOPS_EXECUTED_CORE | PMC1 | 6.000438e+07 | | |
| UOPS_RETIRED_ALL | PMC2 | 6.000438e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000039e+07 | | |
| INSTR_RETIRED_ANY | FIXC0 | 8.000307e+07 | | |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 1.500636e+07 | | |
================================================================================ | |
Group 1 Custom: Region no_micro_no_macro | |
================================================================================ | |
| UOPS_ISSUED_ANY | PMC0 | 8.000476e+07 | | |
| UOPS_EXECUTED_CORE | PMC1 | 8.000483e+07 | | |
| UOPS_RETIRED_ALL | PMC2 | 8.000466e+07 | | |
| BR_INST_RETIRED_NEAR_TAKEN | PMC3 | 1.000039e+07 | | |
| INSTR_RETIRED_ANY | FIXC0 | 8.000312e+07 | | |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 2.000775e+07 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment