Created
October 25, 2014 01:13
-
-
Save nkurz/0bea9db0cff60ead1686 to your computer and use it in GitHub Desktop.
Illustrates the performance difference between a vector broadcast from memory and a vector load followed by shuffles.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// cc -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra broadcast.c -o broadcast | |
// works with 'gcc 4.8.2' and 'icc 14.03', but crashes with 'clang 3.4' because of alignment | |
// usage: broadcast [-r repeat] [-s size] | |
// Optional LIKWID marker support.
// Building with -DLIKWID includes the real LIKWID header; otherwise every
// marker call used in this file expands to nothing, so the benchmark builds
// and runs without LIKWID installed.
#ifdef LIKWID
#include <likwid.h>
#else
#define likwid_markerInit()
#define likwid_markerThreadInit()
#define likwid_markerStartRegion(name)
#define likwid_markerStopRegion(name)
#define likwid_markerClose()
#endif // LIKWID
// Optional IACA marker support.
// Building with -DIACA includes Intel's marker header from its install path;
// otherwise IACA_START and IACA_END expand to nothing.
#ifdef IACA
#include </opt/intel/iaca/include/iacaMarks.h>
#else
#define IACA_START
#define IACA_END
#endif // IACA
#include <stdlib.h> | |
#include <malloc.h> | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <x86intrin.h> | |
#include <getopt.h> | |
// Short aliases for the 128-bit and 256-bit integer vector types.
typedef __m128i xmm_t;
typedef __m256i ymm_t;
// Compiler-level barrier: an empty asm statement that declares a "memory"
// clobber. It emits no instructions of its own.
#define MEM_BARRIER() __asm volatile (""::: /* pretend to clobber */ "memory")
uint64_t loop_movd_and_shuffle(uint32_t *inputArray, uint64_t size) { | |
xmm_t vecSum0 = {0, 0}; | |
xmm_t vecSum1 = {0, 0}; | |
xmm_t vecSum2 = {0, 0}; | |
xmm_t vecSum3 = {0, 0}; | |
xmm_t vecK, vecK0, vecK1, vecK2, vecK3; | |
for (uint64_t i = 0; i < size/4; i += 8) { | |
IACA_START; | |
vecK = _mm_loadu_si128((xmm_t *)&inputArray[i+0]); | |
vecK0 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(0,0,0,0)); | |
vecK1 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(1,1,1,1)); | |
vecK2 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(2,2,2,2)); | |
vecK3 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(3,3,3,3)); | |
vecSum0 = _mm_add_epi32(vecSum0, vecK0); | |
vecSum1 = _mm_add_epi32(vecSum1, vecK1); | |
vecSum2 = _mm_add_epi32(vecSum2, vecK2); | |
vecSum3 = _mm_add_epi32(vecSum3, vecK3); | |
vecK = _mm_loadu_si128((xmm_t *)&inputArray[i+4]); | |
vecK0 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(0,0,0,0)); | |
vecK1 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(1,1,1,1)); | |
vecK2 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(2,2,2,2)); | |
vecK3 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(3,3,3,3)); | |
vecSum0 = _mm_add_epi32(vecSum0, vecK0); | |
vecSum1 = _mm_add_epi32(vecSum1, vecK1); | |
vecSum2 = _mm_add_epi32(vecSum2, vecK2); | |
vecSum3 = _mm_add_epi32(vecSum3, vecK3); | |
IACA_END; | |
} | |
vecSum0 = _mm_add_epi32(vecSum0, vecSum1); | |
vecSum2 = _mm_add_epi32(vecSum2, vecSum3); | |
vecSum0 = _mm_add_epi32(vecSum0, vecSum2); | |
uint64_t sum = 0; | |
sum += _mm_extract_epi32(vecSum0, 0); | |
sum += _mm_extract_epi32(vecSum0, 1); | |
sum += _mm_extract_epi32(vecSum0, 2); | |
sum += _mm_extract_epi32(vecSum0, 3); | |
return sum; | |
} | |
uint64_t loop_broadcast_from_memory(uint32_t *inputArray, uint64_t size) { | |
xmm_t vecSum0 = {0, 0}; | |
xmm_t vecSum1 = {0, 0}; | |
xmm_t vecSum2 = {0, 0}; | |
xmm_t vecSum3 = {0, 0}; | |
xmm_t vecK0, vecK1, vecK2, vecK3; | |
#ifndef __AVX2__ // use float equivalent on Sandy Bridge and Ivy Bridge | |
#define _mm_broadcastd_epi32(arg) (xmm_t) _mm_broadcast_ss((float *) &(arg)) | |
#endif | |
for (uint64_t i = 0; i < size/4; i += 8) { | |
IACA_START; | |
vecK0 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+0]); | |
vecK1 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+1]); | |
vecK2 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+2]); | |
vecK3 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+3]); | |
vecSum0 = _mm_add_epi32(vecSum0, vecK0); | |
vecSum1 = _mm_add_epi32(vecSum1, vecK1); | |
vecSum2 = _mm_add_epi32(vecSum2, vecK2); | |
vecSum3 = _mm_add_epi32(vecSum3, vecK3); | |
vecK0 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+4]); | |
vecK1 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+5]); | |
vecK2 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+6]); | |
vecK3 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+7]); | |
vecSum0 = _mm_add_epi32(vecSum0, vecK0); | |
vecSum1 = _mm_add_epi32(vecSum1, vecK1); | |
vecSum2 = _mm_add_epi32(vecSum2, vecK2); | |
vecSum3 = _mm_add_epi32(vecSum3, vecK3); | |
IACA_END; | |
} | |
vecSum0 = _mm_add_epi32(vecSum0, vecSum1); | |
vecSum2 = _mm_add_epi32(vecSum2, vecSum3); | |
vecSum0 = _mm_add_epi32(vecSum0, vecSum2); | |
uint64_t sum = 0; | |
sum += _mm_extract_epi32(vecSum0, 0); | |
sum += _mm_extract_epi32(vecSum0, 1); | |
sum += _mm_extract_epi32(vecSum0, 2); | |
sum += _mm_extract_epi32(vecSum0, 3); | |
return sum; | |
} | |
/*
 * Parses a non-negative decimal count from a command-line argument.
 *
 * text: argument string; must start with a digit and contain nothing after
 *       the number.
 * out:  receives the parsed value on success and is left untouched on failure.
 *
 * Returns 0 on success and -1 when text is not a plain decimal number.
 * Values too large for strtoull are saturated by strtoull itself.
 */
static int parse_count(const char *text, uint64_t *out) {
    // Requiring a leading digit rejects signs and leading whitespace, which
    // strtoull would otherwise accept (a leading '-' would wrap the value).
    if (text == NULL || text[0] < '0' || text[0] > '9') {
        return -1;
    }
    char *end = NULL;
    unsigned long long value = strtoull(text, &end, 10);
    if (end == text || *end != '\0') {
        return -1;
    }
    *out = value;
    return 0;
}

/*
 * Benchmark driver.
 *
 * usage: broadcast [-r repeat] [-s size]
 *
 * Allocates a 64-byte aligned array of 32-bit integers filled with the
 * repeating pattern 0, 1, 2, 3, calls each kernel `repeat` times inside its
 * own LIKWID marker region, and prints the accumulated sum of each kernel.
 *
 * Returns 0 on success, EXIT_FAILURE on a bad command line or a failed
 * allocation.
 */
int main(int argc, char **argv) {
    uint64_t repeat = 1000 * 10; // number of times outer loop is repeated
    uint64_t size = 1000 * 1000; // number of integers in the test array

    int c;
    while ((c = getopt(argc, argv, "r:s:")) != -1) {
        int status;
        switch (c) {
        case 'r':
            status = parse_count(optarg, &repeat);
            break;
        case 's':
            status = parse_count(optarg, &size);
            break;
        default: // unknown option or missing argument
            status = -1;
            break;
        }
        if (status != 0) {
            fprintf(stderr, "usage: %s [-r repeat] [-s size]\n", argv[0]);
            return EXIT_FAILURE;
        }
    }

    printf("Allocating input array with '%llu' elements.\n", (unsigned long long)size);
    printf("Repeating function call loop '%llu' times.\n", (unsigned long long)repeat);

    // The kernels consume eight elements per iteration while i < size/4, so
    // a size of 4..7 would read past a size-element buffer. Allocate and
    // fill at least eight elements to keep those reads inside the buffer.
    uint64_t allocCount = size < 8 ? 8 : size;
    if (allocCount > SIZE_MAX / sizeof(uint32_t)) {
        fprintf(stderr, "Requested size is too large to allocate.\n");
        return EXIT_FAILURE;
    }
    uint32_t *input = memalign(64, (size_t)allocCount * sizeof *input);
    if (input == NULL) {
        perror("memalign");
        return EXIT_FAILURE;
    }
    for (uint64_t i = 0; i < allocCount; i++) {
        input[i] = (uint32_t)(i % 4); // 0, 1, 2, 3, 0, 1, 2, 3, ...
    }

    uint64_t sum;
    likwid_markerInit();
    likwid_markerThreadInit();

    likwid_markerStartRegion("broadcast");
    sum = 0;
    for (uint64_t i = 0; i < repeat; i++) {
        sum += loop_broadcast_from_memory(input, size);
        MEM_BARRIER(); // prevent compiler from optimizing out the loop
    }
    likwid_markerStopRegion("broadcast");
    printf("Broadcast Sum: %llu\n", (unsigned long long)sum);

    likwid_markerStartRegion("shuffle");
    sum = 0;
    for (uint64_t i = 0; i < repeat; i++) {
        sum += loop_movd_and_shuffle(input, size);
        MEM_BARRIER(); // prevent compiler from optimizing out the loop
    }
    likwid_markerStopRegion("shuffle");
    printf("Shuffle Sum: %llu\n", (unsigned long long)sum);

    likwid_markerClose();
    free(input);
    return 0;
}
#ifdef LIKWID_RESULTS | |
// gcc -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra broadcast.c -o broadcast -DLIKWID -llikwid -lm -lpthread | |
// likwid-perfctr -m -g UOPS_EXECUTED_PORT_PORT_2:PMC0,UOPS_EXECUTED_PORT_PORT_3:PMC1,UOPS_EXECUTED_PORT_PORT_1:PMC2,UOPS_EXECUTED_PORT_PORT_5:PMC3 -C3 broadcast | |
Allocating input array with '1000000' elements. | |
Repeating function call loop '10000' times. | |
Broadcast Sum: 15000000000 | |
Shuffle Sum: 15000000000 | |
------------------------------------------------------------- | |
------------------------------------------------------------- | |
CPU type: Intel Core Haswell processor | |
CPU clock: 3.39 GHz | |
------------------------------------------------------------- | |
broadcast | |
===================== | |
Region: broadcast | |
===================== | |
+-------------------+----------+ | |
| Region Info | core 3 | | |
+-------------------+----------+ | |
| RDTSC Runtime [s] | 0.463913 | | |
| call count | 1 | | |
+-------------------+----------+ | |
+---------------------------+-------------+ | |
| Event | core 3 | | |
+---------------------------+-------------+ | |
| UOPS_EXECUTED_PORT_PORT_2 | 1.25005e+09 | | |
| UOPS_EXECUTED_PORT_PORT_3 | 1.25005e+09 | | |
| UOPS_EXECUTED_PORT_PORT_1 | 1.28721e+09 | | |
| UOPS_EXECUTED_PORT_PORT_5 | 1.21291e+09 | | |
| INSTR_RETIRED_ANY | 6.25031e+09 | | |
| CPU_CLK_UNHALTED_CORE | 1.57357e+09 | | |
| CPU_CLK_UNHALTED_REF | 1.57357e+09 | | |
+---------------------------+-------------+ | |
+----------------------+----------+ | |
| Metric | core 3 | | |
+----------------------+----------+ | |
| Runtime (RDTSC) [s] | 0.463913 | | |
| Runtime unhalted [s] | 0.463885 | | |
| Clock [MHz] | 3392.15 | | |
| CPI | 0.251758 | | |
+----------------------+----------+ | |
===================== | |
Region: shuffle | |
===================== | |
+-------------------+----------+ | |
| Region Info | core 3 | | |
+-------------------+----------+ | |
| RDTSC Runtime [s] | 0.783239 | | |
| call count | 1 | | |
+-------------------+----------+ | |
+---------------------------+-------------+ | |
| Event | core 3 | | |
+---------------------------+-------------+ | |
| UOPS_EXECUTED_PORT_PORT_2 | 3.12518e+08 | | |
| UOPS_EXECUTED_PORT_PORT_3 | 3.12517e+08 | | |
| UOPS_EXECUTED_PORT_PORT_1 | 2.34381e+09 | | |
| UOPS_EXECUTED_PORT_PORT_5 | 2.65637e+09 | | |
| INSTR_RETIRED_ANY | 6.87531e+09 | | |
| CPU_CLK_UNHALTED_CORE | 2.65675e+09 | | |
| CPU_CLK_UNHALTED_REF | 2.65675e+09 | | |
+---------------------------+-------------+ | |
+----------------------+----------+ | |
| Metric | core 3 | | |
+----------------------+----------+ | |
| Runtime (RDTSC) [s] | 0.783239 | | |
| Runtime unhalted [s] | 0.783206 | | |
| Clock [MHz] | 3392.15 | | |
| CPI | 0.386419 | | |
+----------------------+----------+ | |
#endif // LIKWID_RESULTS | |
#ifdef IACA_RESULTS | |
// gcc -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra -c broadcast.c -DIACA -o iaca.o | |
// /opt/intel/iaca/bin/iaca -mark 0 -64 -arch HSW -analysis THROUGHPUT iaca.o | |
Intel(R) Architecture Code Analyzer Version - 2.1 | |
Analyzed File - iaca.o | |
Binary Format - 64Bit | |
Architecture - HSW | |
Analysis Type - Throughput | |
******************************************************************* | |
Intel(R) Architecture Code Analyzer Mark Number 1 | |
******************************************************************* | |
Throughput Analysis Report | |
-------------------------- | |
Block Throughput: 8.00 Cycles Throughput Bottleneck: Port1, Port5 | |
Port Binding In Cycles Per Iteration: | |
--------------------------------------------------------------------------------------- | |
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | | |
--------------------------------------------------------------------------------------- | |
| Cycles | 0.0 0.0 | 8.0 | 1.0 1.0 | 1.0 1.0 | 0.0 | 8.0 | 0.0 | 0.0 | | |
--------------------------------------------------------------------------------------- | |
N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0) | |
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path | |
F - Macro Fusion with the previous instruction occurred | |
* - instruction micro-ops not bound to a port | |
^ - Micro Fusion happened | |
# - ESP Tracking sync uop was issued | |
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected | |
! - instruction not supported, was not accounted in Analysis | |
| Num Of | Ports pressure in cycles | | | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | | | |
--------------------------------------------------------------------------------- | |
| 1 | | | 1.0 1.0 | | | | | | | vmovdqu xmm1, xmmword ptr [rdi] | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm7, xmm1, 0x0 | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm6, xmm1, 0x55 | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm5, xmm1, 0xaa | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm1, xmm1, 0xff | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm1, xmm0, xmm1 | |
| 1 | | | | 1.0 1.0 | | | | | | vmovdqu xmm0, xmmword ptr [rdi+0x10] | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm3, xmm3, xmm7 | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm4, xmm4, xmm6 | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm5 | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm8, xmm0, 0x0 | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm7, xmm0, 0x55 | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm6, xmm0, 0xaa | |
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm5, xmm0, 0xff | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm3, xmm3, xmm8 | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm4, xmm4, xmm7 | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm6 | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm0, xmm1, xmm5 | |
Total Num Of Uops: 18 | |
******************************************************************* | |
Intel(R) Architecture Code Analyzer Mark Number 2 | |
******************************************************************* | |
Throughput Analysis Report | |
-------------------------- | |
Block Throughput: 4.00 Cycles Throughput Bottleneck: FrontEnd, Port1, PORT2_AGU, Port2_DATA, PORT3_AGU, Port3_DATA, Port5 | |
Port Binding In Cycles Per Iteration: | |
--------------------------------------------------------------------------------------- | |
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | | |
--------------------------------------------------------------------------------------- | |
| Cycles | 0.0 0.0 | 4.0 | 4.0 4.0 | 4.0 4.0 | 0.0 | 4.0 | 0.0 | 0.0 | | |
--------------------------------------------------------------------------------------- | |
N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0) | |
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path | |
F - Macro Fusion with the previous instruction occurred | |
* - instruction micro-ops not bound to a port | |
^ - Micro Fusion happened | |
# - ESP Tracking sync uop was issued | |
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected | |
! - instruction not supported, was not accounted in Analysis | |
| Num Of | Ports pressure in cycles | | | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | | | |
--------------------------------------------------------------------------------- | |
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm7, dword ptr [rdi] | |
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm6, dword ptr [rdi+0x4] | |
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm5, dword ptr [rdi+0x8] | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm7 | |
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm4, dword ptr [rdi+0xc] | |
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm3, xmm3, xmm6 | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm0, xmm0, xmm5 | |
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm7, dword ptr [rdi+0x10] | |
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm1, xmm1, xmm4 | |
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm6, dword ptr [rdi+0x14] | |
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm5, dword ptr [rdi+0x18] | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm7 | |
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm4, dword ptr [rdi+0x1c] | |
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm3, xmm3, xmm6 | |
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm0, xmm0, xmm5 | |
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm1, xmm1, xmm4 | |
Total Num Of Uops: 16 | |
#endif // IACA_RESULTS |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment