nkurz

## compare.sh
for CC in "gcc-4.8" \
	      "gcc-4.9" \
	      "gcc-5" "gcc-5 -flto" \
	      "gcc-6" "gcc-6 -flto" \
	      "icc-16" "icc-16 -ipo" \
	      "icc-17" "icc-17 -ipo" \
	      "clang-4.0" "clang-4.0 -flto"
do
    for CFLAGS in "-O0" "-O0 -march=native" \
			"-O1" "-O1 -march=native" \

## fft-asm.txt
gcc-4.8 -O0 fft-test-asm
Self-test passed
     Size    Time per FFT (ns)
        4    min=18  mean=18  sd=0.01%
       16    min=47  mean=48  sd=0.02%
       64    min=176  mean=176  sd=0.08%
      256    min=896  mean=896  sd=0.04%
     1024    min=4790  mean=4794  sd=0.07%
     4096    min=32083  mean=32162  sd=0.13%
    16384    min=194866  mean=194966  sd=0.03%

## fft-model.txt
gcc-4.8 -O0 fft-test-model
Self-test passed
     Size    Time per FFT (ns)
        4    min=47  mean=47  sd=0.01%
       16    min=411  mean=412  sd=0.07%
       64    min=2524  mean=2525  sd=0.03%
      256    min=13656  mean=13661  sd=0.02%
     1024    min=69013  mean=69024  sd=0.01%
     4096    min=337457  mean=337591  sd=0.02%
    16384    min=1584876  mean=1585286  sd=0.02%

## fft-portable.txt
gcc-4.8 -O0 fft-test-portable
Self-test passed
     Size    Time per FFT (ns)
        4    min=84  mean=84  sd=0.01%
       16    min=540  mean=541  sd=0.03%
       64    min=3035  mean=3037  sd=0.13%
      256    min=15759  mean=15763  sd=0.02%
     1024    min=77969  mean=77984  sd=0.01%
     4096    min=375086  mean=375292  sd=0.04%
    16384    min=1765013  mean=1765401  sd=0.02%

## Results Haswell
nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
-------------------------------------------------------------
-------------------------------------------------------------
CPU type:	Intel Core Haswell processor
CPU clock:	3.39 GHz
-------------------------------------------------------------
fusion
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999

## lxdetectirq_trace_function_graph.txt
/* function_graph trace of one iteration from http://nicst.de/bench-user-irq-detect.html */

/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
/*	ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
381887.380150 |   3)               |  syscall_trace_enter_phase1() {
381887.380150 |   3)               |    context_tracking_user_exit() {
381887.380151 |   3)               |      context_tracking_exit() {
381887.380151 |   3)   0.025 us    |        context_tracking_recursion_enter();
381887.380151 |   3)   0.026 us    |        rcu_user_exit();
381887.380151 |   3)               |        vtime_account_user() {

## lxdetectirq_trace_function.txt
/* function trace of one iteration from http://nicst.de/bench-user-irq-detect.html */

/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
/*	ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
           timer-5372  [003] .... 382853.609575: syscall_trace_enter_phase1 <-tracesys
           timer-5372  [003] .... 382853.609575: context_tracking_user_exit <-syscall_trace_enter_phase1
           timer-5372  [003] .... 382853.609575: context_tracking_exit <-context_tracking_user_exit
           timer-5372  [003] d... 382853.609576: context_tracking_recursion_enter <-context_tracking_exit
           timer-5372  [003] d... 382853.609576: rcu_user_exit <-context_tracking_exit
           timer-5372  [003] d... 382853.609576: vtime_account_user <-context_tracking_exit

## avx.c
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native avx.c -o avx

#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
#include <malloc.h>

## l1d.c
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native l1d.c -o l1d

#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>

## bestAllocation.R
bestAllocation = function(treatedList=c(0,1,8,39,152),  # treated in each category
                          totalsList=rep(200, 5),       # treated + untreated in each
                          numToAdd=100) {               # number new treated available
    addedList = rep(0, length(treatedList))  # start with nothing added
    while (numToAdd > 0) {
        ratio = (treatedList + addedList) / (totalsList + addedList)
        lowest = which.min(ratio)
        addedList[[lowest]] = addedList[[lowest]] + 1
        numToAdd = numToAdd - 1
    }
	for CC in "gcc-4.8" \
	"gcc-4.9" \
	"gcc-5" "gcc-5 -flto" \
	"gcc-6" "gcc-6 -flto" \
	"icc-16" "icc-16 -ipo" \
	"icc-17" "icc-17 -ipo" \
	"clang-4.0" "clang-4.0 -flto"
	do
	for CFLAGS in "-O0" "-O0 -march=native" \
	"-O1" "-O1 -march=native" \
	gcc-4.8 -O0 fft-test-asm
	Self-test passed
	Size Time per FFT (ns)
	4 min=18 mean=18 sd=0.01%
	16 min=47 mean=48 sd=0.02%
	64 min=176 mean=176 sd=0.08%
	256 min=896 mean=896 sd=0.04%
	1024 min=4790 mean=4794 sd=0.07%
	4096 min=32083 mean=32162 sd=0.13%
	16384 min=194866 mean=194966 sd=0.03%
	gcc-4.8 -O0 fft-test-model
	Self-test passed
	Size Time per FFT (ns)
	4 min=47 mean=47 sd=0.01%
	16 min=411 mean=412 sd=0.07%
	64 min=2524 mean=2525 sd=0.03%
	256 min=13656 mean=13661 sd=0.02%
	1024 min=69013 mean=69024 sd=0.01%
	4096 min=337457 mean=337591 sd=0.02%
	16384 min=1584876 mean=1585286 sd=0.02%
	gcc-4.8 -O0 fft-test-portable
	Self-test passed
	Size Time per FFT (ns)
	4 min=84 mean=84 sd=0.01%
	16 min=540 mean=541 sd=0.03%
	64 min=3035 mean=3037 sd=0.13%
	256 min=15759 mean=15763 sd=0.02%
	1024 min=77969 mean=77984 sd=0.01%
	4096 min=375086 mean=375292 sd=0.04%
	16384 min=1765013 mean=1765401 sd=0.02%
	nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
	-------------------------------------------------------------
	-------------------------------------------------------------
	CPU type: Intel Core Haswell processor
	CPU clock: 3.39 GHz
	-------------------------------------------------------------
	fusion
	two_micro_two_macro: sum1=10000000, sum2=9999999
	one_micro_two_macro: sum1=10000000, sum2=9999999
	one_micro_one_macro: sum1=10000000, sum2=9999999
	/* function_graph trace of one iteration from http://nicst.de/bench-user-irq-detect.html */

	/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
	/* ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
	381887.380150 \| 3) \| syscall_trace_enter_phase1() {
	381887.380150 \| 3) \| context_tracking_user_exit() {
	381887.380151 \| 3) \| context_tracking_exit() {
	381887.380151 \| 3) 0.025 us \| context_tracking_recursion_enter();
	381887.380151 \| 3) 0.026 us \| rcu_user_exit();
	381887.380151 \| 3) \| vtime_account_user() {
	/* function trace of one iteration from http://nicst.de/bench-user-irq-detect.html */

	/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
	/* ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
	timer-5372 [003] .... 382853.609575: syscall_trace_enter_phase1 <-tracesys
	timer-5372 [003] .... 382853.609575: context_tracking_user_exit <-syscall_trace_enter_phase1
	timer-5372 [003] .... 382853.609575: context_tracking_exit <-context_tracking_user_exit
	timer-5372 [003] d... 382853.609576: context_tracking_recursion_enter <-context_tracking_exit
	timer-5372 [003] d... 382853.609576: rcu_user_exit <-context_tracking_exit
	timer-5372 [003] d... 382853.609576: vtime_account_user <-context_tracking_exit
	// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native avx.c -o avx

	#include <sys/types.h>
	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <x86intrin.h>
	#include <math.h>
	#include <malloc.h>
	bestAllocation = function(treatedList=c(0,1,8,39,152), # treated in each category
	totalsList=rep(200, 5), # treated + untreated in each
	numToAdd=100) { # number new treated available
	addedList = rep(0, length(treatedList)) # start with nothing added
	while (numToAdd > 0) {
	ratio = (treatedList + addedList) / (totalsList + addedList)
	lowest = which.min(ratio)
	addedList[[lowest]] = addedList[[lowest]] + 1
	numToAdd = numToAdd - 1
	}