Skip to content

Instantly share code, notes, and snippets.

for CC in "gcc-4.8" \
"gcc-4.9" \
"gcc-5" "gcc-5 -flto" \
"gcc-6" "gcc-6 -flto" \
"icc-16" "icc-16 -ipo" \
"icc-17" "icc-17 -ipo" \
"clang-4.0" "clang-4.0 -flto"
do
for CFLAGS in "-O0" "-O0 -march=native" \
"-O1" "-O1 -march=native" \
gcc-4.8 -O0 fft-test-asm
Self-test passed
Size Time per FFT (ns)
4 min=18 mean=18 sd=0.01%
16 min=47 mean=48 sd=0.02%
64 min=176 mean=176 sd=0.08%
256 min=896 mean=896 sd=0.04%
1024 min=4790 mean=4794 sd=0.07%
4096 min=32083 mean=32162 sd=0.13%
16384 min=194866 mean=194966 sd=0.03%
gcc-4.8 -O0 fft-test-model
Self-test passed
Size Time per FFT (ns)
4 min=47 mean=47 sd=0.01%
16 min=411 mean=412 sd=0.07%
64 min=2524 mean=2525 sd=0.03%
256 min=13656 mean=13661 sd=0.02%
1024 min=69013 mean=69024 sd=0.01%
4096 min=337457 mean=337591 sd=0.02%
16384 min=1584876 mean=1585286 sd=0.02%
gcc-4.8 -O0 fft-test-portable
Self-test passed
Size Time per FFT (ns)
4 min=84 mean=84 sd=0.01%
16 min=540 mean=541 sd=0.03%
64 min=3035 mean=3037 sd=0.13%
256 min=15759 mean=15763 sd=0.02%
1024 min=77969 mean=77984 sd=0.01%
4096 min=375086 mean=375292 sd=0.04%
16384 min=1765013 mean=1765401 sd=0.02%
@nkurz
nkurz / Results Haswell
Created July 12, 2016 03:37
Differences in macro- and micro-fusion performance Skylake vs Haswell
nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
-------------------------------------------------------------
-------------------------------------------------------------
CPU type: Intel Core Haswell processor
CPU clock: 3.39 GHz
-------------------------------------------------------------
fusion
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
/* function_graph trace of one iteration from http://nicst.de/bench-user-irq-detect.html */
/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
/* ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
381887.380150 | 3) | syscall_trace_enter_phase1() {
381887.380150 | 3) | context_tracking_user_exit() {
381887.380151 | 3) | context_tracking_exit() {
381887.380151 | 3) 0.025 us | context_tracking_recursion_enter();
381887.380151 | 3) 0.026 us | rcu_user_exit();
381887.380151 | 3) | vtime_account_user() {
/* function trace of one iteration from http://nicst.de/bench-user-irq-detect.html */
/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
/* ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
timer-5372 [003] .... 382853.609575: syscall_trace_enter_phase1 <-tracesys
timer-5372 [003] .... 382853.609575: context_tracking_user_exit <-syscall_trace_enter_phase1
timer-5372 [003] .... 382853.609575: context_tracking_exit <-context_tracking_user_exit
timer-5372 [003] d... 382853.609576: context_tracking_recursion_enter <-context_tracking_exit
timer-5372 [003] d... 382853.609576: rcu_user_exit <-context_tracking_exit
timer-5372 [003] d... 382853.609576: vtime_account_user <-context_tracking_exit
@nkurz
nkurz / avx.c
Created December 27, 2015 23:41
Alignment strongly affects vector load bandwidth
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native avx.c -o avx
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
#include <malloc.h>
@nkurz
nkurz / l1d.c
Created December 26, 2015 23:32
Are sustained loads of 64B per cycle possible on Haswell and Skylake?
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native l1d.c -o l1d
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
bestAllocation = function(treatedList=c(0,1,8,39,152), # treated in each category
totalsList=rep(200, 5), # treated + untreated in each
numToAdd=100) { # number new treated available
addedList = rep(0, length(treatedList)) # start with nothing added
while (numToAdd > 0) {
ratio = (treatedList + addedList) / (totalsList + addedList)
lowest = which.min(ratio)
addedList[[lowest]] = addedList[[lowest]] + 1
numToAdd = numToAdd - 1
}