Skip to content

Instantly share code, notes, and snippets.

@nkurz
Created October 25, 2014 01:13
Show Gist options
  • Save nkurz/0bea9db0cff60ead1686 to your computer and use it in GitHub Desktop.
Illustrate the performance difference between vector broadcast from memory vs load and shuffle
// cc -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra broadcast.c -o broadcast
// works with 'gcc 4.8.2' and 'icc 14.03', but crashes with 'clang 3.4' because of alignment
// usage: broadcast [-r repeat] [-s size]
#ifdef LIKWID
#include <likwid.h>
#else
#define likwid_markerInit()
#define likwid_markerThreadInit()
#define likwid_markerStartRegion(name)
#define likwid_markerStopRegion(name)
#define likwid_markerClose()
#endif // LIKWID
#ifdef IACA
#include </opt/intel/iaca/include/iacaMarks.h>
#else
#define IACA_START
#define IACA_END
#endif // IACA
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>
#include <getopt.h>
typedef __m128i xmm_t;
typedef __m256i ymm_t;
#define MEM_BARRIER() __asm volatile (""::: /* pretend to clobber */ "memory")
// Sums the first size/4 elements of inputArray, counting each 32-bit value
// once per SIMD lane (i.e. 4x), and returns the scalar total.
// Strategy: one 128-bit load per 4 elements, then four PSHUFD shuffles to
// broadcast each lane -- the "load and shuffle" half of the benchmark,
// contrasted with loop_broadcast_from_memory() below.
// Four independent accumulators break the add dependency chain.
// NOTE(review): assumes size/4 is a multiple of 8 (true for the default
// size of 1,000,000); other sizes leave a tail unprocessed -- confirm
// intended usage before generalizing.
uint64_t loop_movd_and_shuffle(uint32_t *inputArray, uint64_t size) {
// {0, 0} zeroes both 64-bit halves of the __m128i accumulator
xmm_t vecSum0 = {0, 0};
xmm_t vecSum1 = {0, 0};
xmm_t vecSum2 = {0, 0};
xmm_t vecSum3 = {0, 0};
xmm_t vecK, vecK0, vecK1, vecK2, vecK3;
// Each iteration consumes 8 uint32_t values via two 128-bit loads.
for (uint64_t i = 0; i < size/4; i += 8) {
IACA_START;
// First group of 4: load once, broadcast each 32-bit lane with PSHUFD.
vecK = _mm_loadu_si128((xmm_t *)&inputArray[i+0]);
vecK0 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(0,0,0,0));
vecK1 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(1,1,1,1));
vecK2 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(2,2,2,2));
vecK3 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(3,3,3,3));
vecSum0 = _mm_add_epi32(vecSum0, vecK0);
vecSum1 = _mm_add_epi32(vecSum1, vecK1);
vecSum2 = _mm_add_epi32(vecSum2, vecK2);
vecSum3 = _mm_add_epi32(vecSum3, vecK3);
// Second group of 4 (manual 2x unroll of the same pattern).
vecK = _mm_loadu_si128((xmm_t *)&inputArray[i+4]);
vecK0 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(0,0,0,0));
vecK1 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(1,1,1,1));
vecK2 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(2,2,2,2));
vecK3 = _mm_shuffle_epi32(vecK, _MM_SHUFFLE(3,3,3,3));
vecSum0 = _mm_add_epi32(vecSum0, vecK0);
vecSum1 = _mm_add_epi32(vecSum1, vecK1);
vecSum2 = _mm_add_epi32(vecSum2, vecK2);
vecSum3 = _mm_add_epi32(vecSum3, vecK3);
IACA_END;
}
// Fold the four accumulators into one vector, then reduce the four
// 32-bit lanes into a scalar.  Lane adds wrap at 32 bits; the final
// scalar sum is 64-bit only from this point on.
vecSum0 = _mm_add_epi32(vecSum0, vecSum1);
vecSum2 = _mm_add_epi32(vecSum2, vecSum3);
vecSum0 = _mm_add_epi32(vecSum0, vecSum2);
uint64_t sum = 0;
sum += _mm_extract_epi32(vecSum0, 0);
sum += _mm_extract_epi32(vecSum0, 1);
sum += _mm_extract_epi32(vecSum0, 2);
sum += _mm_extract_epi32(vecSum0, 3);
return sum;
}
// Same computation as loop_movd_and_shuffle(): sums the first size/4
// elements of inputArray, each counted 4x (once per lane), returning the
// scalar total.  Strategy: one VPBROADCASTD (or VBROADCASTSS pre-AVX2)
// directly from memory per element -- the "broadcast from memory" half of
// the benchmark.  Must produce the same result as the shuffle version.
// NOTE(review): `*(xmm_t *)&inputArray[i+1]` forms a 16-byte vector lvalue
// at a 4-byte-aligned address; only the low 32 bits are read, but this is
// the alignment issue the file header says crashes clang 3.4 -- left as-is
// because the generated instruction sequence is the point of the benchmark.
uint64_t loop_broadcast_from_memory(uint32_t *inputArray, uint64_t size) {
// Four independent accumulators, zero-initialized.
xmm_t vecSum0 = {0, 0};
xmm_t vecSum1 = {0, 0};
xmm_t vecSum2 = {0, 0};
xmm_t vecSum3 = {0, 0};
xmm_t vecK0, vecK1, vecK2, vecK3;
#ifndef __AVX2__ // use float equivalent on Sandy Bridge and Ivy Bridge
// Pre-AVX2 fallback: VBROADCASTSS moves the same 32 bits; the float
// reinterpretation is bitwise-transparent for integer adds.
#define _mm_broadcastd_epi32(arg) (xmm_t) _mm_broadcast_ss((float *) &(arg))
#endif
// Each iteration consumes 8 uint32_t values, one broadcast each.
for (uint64_t i = 0; i < size/4; i += 8) {
IACA_START;
vecK0 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+0]);
vecK1 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+1]);
vecK2 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+2]);
vecK3 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+3]);
vecSum0 = _mm_add_epi32(vecSum0, vecK0);
vecSum1 = _mm_add_epi32(vecSum1, vecK1);
vecSum2 = _mm_add_epi32(vecSum2, vecK2);
vecSum3 = _mm_add_epi32(vecSum3, vecK3);
// Second group of 4 (manual 2x unroll of the same pattern).
vecK0 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+4]);
vecK1 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+5]);
vecK2 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+6]);
vecK3 = _mm_broadcastd_epi32(*(xmm_t *)&inputArray[i+7]);
vecSum0 = _mm_add_epi32(vecSum0, vecK0);
vecSum1 = _mm_add_epi32(vecSum1, vecK1);
vecSum2 = _mm_add_epi32(vecSum2, vecK2);
vecSum3 = _mm_add_epi32(vecSum3, vecK3);
IACA_END;
}
// Fold accumulators, then reduce the four 32-bit lanes to a scalar.
vecSum0 = _mm_add_epi32(vecSum0, vecSum1);
vecSum2 = _mm_add_epi32(vecSum2, vecSum3);
vecSum0 = _mm_add_epi32(vecSum0, vecSum2);
uint64_t sum = 0;
sum += _mm_extract_epi32(vecSum0, 0);
sum += _mm_extract_epi32(vecSum0, 1);
sum += _mm_extract_epi32(vecSum0, 2);
sum += _mm_extract_epi32(vecSum0, 3);
return sum;
}
// Benchmark driver.  Allocates a 64-byte-aligned array of uint32_t filled
// with the repeating pattern 0,1,2,3, then times the broadcast-from-memory
// kernel followed by the load-and-shuffle kernel, each called 'repeat'
// times.  The likwid_marker* calls compile to no-ops unless -DLIKWID.
// Options: -r <repeat>  outer-loop repetitions (default 10000)
//          -s <size>    number of uint32_t elements (default 1000000)
// Returns 0 on success, 1 on bad options or allocation failure.
int main(int argc, char **argv) {
    uint64_t repeat = 1000 * 10; // number of times outer loop is repeated
    uint64_t size = 1000 * 1000; // number of integers in the test array
    int c;
    while ((c = getopt(argc, argv, "r:s:")) != -1) {
        switch (c) {
        case 'r':
            // strtoull covers the full uint64_t range and, unlike atoi,
            // does not silently truncate large values
            repeat = strtoull(optarg, NULL, 10);
            break;
        case 's':
            size = strtoull(optarg, NULL, 10);
            break;
        default: // getopt already printed a diagnostic for the bad option
            fprintf(stderr, "usage: %s [-r repeat] [-s size]\n", argv[0]);
            return 1;
        }
    }
    // %llu with an explicit cast is correct for uint64_t on all LP64/LLP64
    // targets; the original "%ld" mismatched both signedness and (on some
    // platforms) width.
    printf("Allocating input array with '%llu' elements.\n",
           (unsigned long long)size);
    printf("Repeating function call loop '%llu' times.\n",
           (unsigned long long)repeat);
    uint32_t *input = memalign(64, size * sizeof(uint32_t));
    if (input == NULL) { // allocation can fail for large -s values
        fprintf(stderr, "failed to allocate %llu bytes\n",
                (unsigned long long)(size * sizeof(uint32_t)));
        return 1;
    }
    for (uint64_t i = 0; i < size; i++) {
        input[i] = i % 4; // 0, 1, 2, 3, 0, 1, 2, 3, ...
    }
    uint64_t sum;
    likwid_markerInit();
    likwid_markerThreadInit();
    likwid_markerStartRegion("broadcast");
    sum = 0;
    for (uint64_t i = 0; i < repeat; i++) {
        sum += loop_broadcast_from_memory(input, size);
        MEM_BARRIER(); // prevent compiler from optimizing out the loop
    }
    likwid_markerStopRegion("broadcast");
    printf("Broadcast Sum: %llu\n", (unsigned long long)sum);
    likwid_markerStartRegion("shuffle");
    sum = 0;
    for (uint64_t i = 0; i < repeat; i++) {
        sum += loop_movd_and_shuffle(input, size);
        MEM_BARRIER(); // prevent compiler from optimizing out the loop
    }
    likwid_markerStopRegion("shuffle");
    printf("Shuffle Sum: %llu\n", (unsigned long long)sum);
    likwid_markerClose();
    free(input); // memalign memory is released with free()
    return 0;
}
#ifdef LIKWID_RESULTS
// gcc -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra broadcast.c -o broadcast -DLIKWID -llikwid -lm -lpthread
// likwid-perfctr -m -g UOPS_EXECUTED_PORT_PORT_2:PMC0,UOPS_EXECUTED_PORT_PORT_3:PMC1,UOPS_EXECUTED_PORT_PORT_1:PMC2,UOPS_EXECUTED_PORT_PORT_5:PMC3 -C3 broadcast
Allocating input array with '1000000' elements.
Repeating function call loop '10000' times.
Broadcast Sum: 15000000000
Shuffle Sum: 15000000000
-------------------------------------------------------------
-------------------------------------------------------------
CPU type: Intel Core Haswell processor
CPU clock: 3.39 GHz
-------------------------------------------------------------
broadcast
=====================
Region: broadcast
=====================
+-------------------+----------+
| Region Info | core 3 |
+-------------------+----------+
| RDTSC Runtime [s] | 0.463913 |
| call count | 1 |
+-------------------+----------+
+---------------------------+-------------+
| Event | core 3 |
+---------------------------+-------------+
| UOPS_EXECUTED_PORT_PORT_2 | 1.25005e+09 |
| UOPS_EXECUTED_PORT_PORT_3 | 1.25005e+09 |
| UOPS_EXECUTED_PORT_PORT_1 | 1.28721e+09 |
| UOPS_EXECUTED_PORT_PORT_5 | 1.21291e+09 |
| INSTR_RETIRED_ANY | 6.25031e+09 |
| CPU_CLK_UNHALTED_CORE | 1.57357e+09 |
| CPU_CLK_UNHALTED_REF | 1.57357e+09 |
+---------------------------+-------------+
+----------------------+----------+
| Metric | core 3 |
+----------------------+----------+
| Runtime (RDTSC) [s] | 0.463913 |
| Runtime unhalted [s] | 0.463885 |
| Clock [MHz] | 3392.15 |
| CPI | 0.251758 |
+----------------------+----------+
=====================
Region: shuffle
=====================
+-------------------+----------+
| Region Info | core 3 |
+-------------------+----------+
| RDTSC Runtime [s] | 0.783239 |
| call count | 1 |
+-------------------+----------+
+---------------------------+-------------+
| Event | core 3 |
+---------------------------+-------------+
| UOPS_EXECUTED_PORT_PORT_2 | 3.12518e+08 |
| UOPS_EXECUTED_PORT_PORT_3 | 3.12517e+08 |
| UOPS_EXECUTED_PORT_PORT_1 | 2.34381e+09 |
| UOPS_EXECUTED_PORT_PORT_5 | 2.65637e+09 |
| INSTR_RETIRED_ANY | 6.87531e+09 |
| CPU_CLK_UNHALTED_CORE | 2.65675e+09 |
| CPU_CLK_UNHALTED_REF | 2.65675e+09 |
+---------------------------+-------------+
+----------------------+----------+
| Metric | core 3 |
+----------------------+----------+
| Runtime (RDTSC) [s] | 0.783239 |
| Runtime unhalted [s] | 0.783206 |
| Clock [MHz] | 3392.15 |
| CPI | 0.386419 |
+----------------------+----------+
#endif // LIKWID_RESULTS
#ifdef IACA_RESULTS
// gcc -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra -c broadcast.c -DIACA -o iaca.o
// /opt/intel/iaca/bin/iaca -mark 0 -64 -arch HSW -analysis THROUGHPUT iaca.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - iaca.o
Binary Format - 64Bit
Architecture - HSW
Analysis Type - Throughput
*******************************************************************
Intel(R) Architecture Code Analyzer Mark Number 1
*******************************************************************
Throughput Analysis Report
--------------------------
Block Throughput: 8.00 Cycles Throughput Bottleneck: Port1, Port5
Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
---------------------------------------------------------------------------------------
| Cycles | 0.0 0.0 | 8.0 | 1.0 1.0 | 1.0 1.0 | 0.0 | 8.0 | 0.0 | 0.0 |
---------------------------------------------------------------------------------------
N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected
! - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | |
---------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | | vmovdqu xmm1, xmmword ptr [rdi]
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm7, xmm1, 0x0
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm6, xmm1, 0x55
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm5, xmm1, 0xaa
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm1, xmm1, 0xff
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm1, xmm0, xmm1
| 1 | | | | 1.0 1.0 | | | | | | vmovdqu xmm0, xmmword ptr [rdi+0x10]
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm3, xmm3, xmm7
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm4, xmm4, xmm6
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm5
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm8, xmm0, 0x0
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm7, xmm0, 0x55
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm6, xmm0, 0xaa
| 1 | | | | | | 1.0 | | | CP | vpshufd xmm5, xmm0, 0xff
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm3, xmm3, xmm8
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm4, xmm4, xmm7
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm6
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm0, xmm1, xmm5
Total Num Of Uops: 18
*******************************************************************
Intel(R) Architecture Code Analyzer Mark Number 2
*******************************************************************
Throughput Analysis Report
--------------------------
Block Throughput: 4.00 Cycles Throughput Bottleneck: FrontEnd, Port1, PORT2_AGU, Port2_DATA, PORT3_AGU, Port3_DATA, Port5
Port Binding In Cycles Per Iteration:
---------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
---------------------------------------------------------------------------------------
| Cycles | 0.0 0.0 | 4.0 | 4.0 4.0 | 4.0 4.0 | 0.0 | 4.0 | 0.0 | 0.0 |
---------------------------------------------------------------------------------------
N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected
! - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | |
---------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm7, dword ptr [rdi]
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm6, dword ptr [rdi+0x4]
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm5, dword ptr [rdi+0x8]
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm7
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm4, dword ptr [rdi+0xc]
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm3, xmm3, xmm6
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm0, xmm0, xmm5
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm7, dword ptr [rdi+0x10]
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm1, xmm1, xmm4
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm6, dword ptr [rdi+0x14]
| 1 | | | 1.0 1.0 | | | | | | CP | vpbroadcastd xmm5, dword ptr [rdi+0x18]
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm2, xmm2, xmm7
| 1 | | | | 1.0 1.0 | | | | | CP | vpbroadcastd xmm4, dword ptr [rdi+0x1c]
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm3, xmm3, xmm6
| 1 | | 1.0 | | | | | | | CP | vpaddd xmm0, xmm0, xmm5
| 1 | | | | | | 1.0 | | | CP | vpaddd xmm1, xmm1, xmm4
Total Num Of Uops: 16
#endif // IACA_RESULTS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment